`
goodluck_wgw
  • 浏览: 95395 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

基于 Hadoop 平台的 Pig 语言对 Apache 日志的分析

阅读更多
Pig 脚本如下:
-- Stage 1: load the raw access log, pull out the fields of interest, keep only
-- records that carry a vuid, and route them by request path into `ihm` / `ia`.

-- Jar providing the myudfs.* functions used below.
REGISTER myudfs.jar;

-- NOTE(review): DayExtractor is defined here but no statement in this script
-- references it; the date conversion below calls myudfs.DateExtractor instead.
-- Confirm whether this DEFINE is still needed.
DEFINE DayExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd hh:mm:ss');

-- Read each log line as ten chararray columns using PigStorage's default delimiter.
raw_log = LOAD '/user/input/test/hp_analytics.ifensi.com-access_log.log'
    USING PigStorage()
    AS (l1:chararray, l2:chararray, l3:chararray, l4:chararray, l5:chararray,
        l6:chararray, l7:chararray, l8:chararray, l9:chararray, l10:chararray);

-- Field extraction:
--   l1  -> split at the first ',' into (ip, otherargs)
--   l4  -> characters at index 1..20 become the raw date string
--   l5  -> the second space-separated token of the quoted request is the url
--   l8, l9, l10 -> drop the first and last character (referer, useragent, vuid)
extracted_log = FOREACH raw_log GENERATE
    FLATTEN(STRSPLIT(l1, '\\,', 2)) AS (ip, otherargs),
    SUBSTRING(l4, 1, 21) AS date,
    FLATTEN(REGEX_EXTRACT_ALL(l5, '\\"[^ ]* ([^ ]*) [^\\"]*\\"')) AS url,
    FLATTEN(REGEX_EXTRACT_ALL(l8, '.{1}(.*).{1}')) AS referer,
    FLATTEN(REGEX_EXTRACT_ALL(l9, '.{1}(.*).{1}')) AS useragent,
    FLATTEN(REGEX_EXTRACT_ALL(l10, '.{1}(.*).{1}')) AS vuid;

-- Keep only rows whose vuid column starts with the literal text 'vuid'.
vuid_log = FILTER extracted_log BY SUBSTRING(vuid, 0, 4) == 'vuid';

-- Convert the date through the custom UDF, split the url at the first '?' into
-- (cmd, args), and strip the first five characters from the vuid column.
log3 = FOREACH vuid_log GENERATE
    ip,
    myudfs.DateExtractor(date) AS date,
    FLATTEN(STRSPLIT(url, '\\?', 2)) AS (cmd, args),
    referer,
    useragent,
    FLATTEN(REGEX_EXTRACT_ALL(vuid, '.{5}(.*)')) AS vuid;

-- Route by request path; rows matching neither condition go to no output relation.
SPLIT log3 INTO
    ihm IF cmd == '/__ihm.gif',
    ia IF cmd == '/__ia.gif';
-- ia process block
-- Handles the '/__ia.gif' records: splits the timestamp, reads the tracker
-- version from the query string, then parses and stores each version separately.

-- Split the converted date at the first '|' into (date, time) and peel the leading
-- 'version=' parameter off the query string; the remainder is kept in ia_other.
ia_parsed = FOREACH ia GENERATE
    vuid,
    ip,
    FLATTEN(STRSPLIT(date, '\\|', 2)) AS (date, time),
    FLATTEN(REGEX_EXTRACT_ALL(args, 'version=([^&]*)&(.*)')) AS (ia_version, ia_other),
    referer,
    useragent;

-- Only versions '1.0' and '1.1' are routed onward.
SPLIT ia_parsed INTO
    ia_v10 IF ia_version == '1.0',
    ia_v11 IF ia_version == '1.1';

-- Version 1.0: twelve query parameters, expected in this exact order.
ia_v10_fields = FOREACH ia_v10 GENERATE
    vuid,
    ip,
    date,
    time,
    FLATTEN(REGEX_EXTRACT_ALL(ia_other, 'browser=([^&]*)&browser_version=([^&]*)&operation_system=([^&]*)&operation_system_version=([^&]*)&flash_version=([^&]*)&java_enabled=([^&]*)&language=([^&]*)&screen_colors=([^&]*)&screen_resolution=([^&]*)&referrer=([^&]*)&tourl=([^&]*)&vuid=([^&]*)'))
        AS (ia_browser, ia_browser_version, ia_operation_system, ia_operation_system_version,
            ia_flash_version, ia_java_enabled, ia_language, ia_screen_colors,
            ia_screen_resolution, ia_referrer, ia_tourl, ia_vuid),
    referer,
    useragent;

-- Explicit projection fixing the stored column order for version 1.0.
ia_v10_out = FOREACH ia_v10_fields GENERATE
    vuid, ip, date, time,
    ia_browser, ia_browser_version,
    ia_operation_system, ia_operation_system_version,
    ia_flash_version, ia_java_enabled, ia_language,
    ia_screen_colors, ia_screen_resolution,
    ia_referrer, ia_tourl, ia_vuid,
    referer, useragent;

STORE ia_v10_out INTO '/test/output/data/ia/ia_version1' USING PigStorage();

-- Version 1.1: same parameters plus title, muid, mfid, musername and memail.
ia_v11_fields = FOREACH ia_v11 GENERATE
    vuid,
    ip,
    date,
    time,
    FLATTEN(REGEX_EXTRACT_ALL(ia_other, 'browser=([^&]*)&browser_version=([^&]*)&operation_system=([^&]*)&operation_system_version=([^&]*)&flash_version=([^&]*)&java_enabled=([^&]*)&language=([^&]*)&screen_colors=([^&]*)&screen_resolution=([^&]*)&referrer=([^&]*)&tourl=([^&]*)&title=([^&]*)&vuid=([^&]*)&muid=([^&]*)&mfid=([^&]*)&musername=([^&]*)&memail=([^&]*)'))
        AS (ia_browser, ia_browser_version, ia_operation_system, ia_operation_system_version,
            ia_flash_version, ia_java_enabled, ia_language, ia_screen_colors,
            ia_screen_resolution, ia_referrer, ia_tourl, ia_title, ia_vuid,
            ia_muid, ia_mfid, ia_musername, ia_memail),
    referer,
    useragent;

-- Stored column order for version 1.1: the member fields (muid, mfid, musername,
-- memail) are placed after referer and useragent.
ia_v11_out = FOREACH ia_v11_fields GENERATE
    vuid, ip, date, time,
    ia_browser, ia_browser_version,
    ia_operation_system, ia_operation_system_version,
    ia_flash_version, ia_java_enabled, ia_language,
    ia_screen_colors, ia_screen_resolution,
    ia_referrer, ia_tourl, ia_title, ia_vuid,
    referer, useragent,
    ia_muid, ia_mfid, ia_musername, ia_memail;

STORE ia_v11_out INTO '/test/output/data/ia/ia_version2' USING PigStorage();
-- ihm process block
-- Handles the '/__ihm.gif' records: splits the timestamp, parses the query
-- string parameters, and stores one row per record.

-- Split the converted date at the first '|' into (date, time) and separate the
-- leading 'version=' parameter from the rest of the query string.
ihm_versioned = FOREACH ihm GENERATE
    vuid,
    ip,
    FLATTEN(STRSPLIT(date, '\\|', 2)) AS (date, time),
    FLATTEN(REGEX_EXTRACT_ALL(args, 'version=([^&]*)&(.*)')) AS (ihm_version, ihm_other),
    referer,
    useragent;

-- Parse vuid, url, width, x and y, expected in this exact order; y takes
-- everything after 'y=' to the end of the string.
ihm_fields = FOREACH ihm_versioned GENERATE
    vuid,
    ip,
    date,
    time,
    ihm_version,
    FLATTEN(REGEX_EXTRACT_ALL(ihm_other, 'vuid=([^&]*)&url=([^&]*)&width=([^&]*)&x=([^&]*)&y=(.*)'))
        AS (ihm_vuid, ihm_url, ihm_width, ihm_x, ihm_y),
    referer,
    useragent;

-- Final column order: ihm_version is dropped and width is moved after x and y.
ihm_out = FOREACH ihm_fields GENERATE
    vuid, ip, date, time,
    ihm_vuid, ihm_url,
    ihm_x, ihm_y, ihm_width,
    referer, useragent;

STORE ihm_out INTO '/test/output/data/ihm' USING PigStorage();
附件为部分日志文件。
分享到:
评论
1 楼 chenzhong 2012-01-06  

相关推荐

Global site tag (gtag.js) - Google Analytics