Search-string charset conversion for webalizer.c.

What this patch adds:
  * three iconv descriptors (Shift_JIS, EUC-JP, UTF-8 -> ICONV_DEST_CODE,
    default "EUC-JP", overridable at compile time), opened before the main
    log loop and closed just before exit;
  * decoding of "\xHH" and "\\" log escapes ahead of the existing
    %-escape loop;
  * score_eucj()/score_sjis()/score_utf8(): byte-pattern heuristics that
    return a plausibility score for a string, or -1 if any byte sequence
    is invalid for that encoding;
  * in the search-string code: pick the best-scoring encoding (UTF-8 wins
    ties), convert with iconv, and fall back to replacing bytes >= 0x80
    with '_' when conversion is not possible.  Lower-casing now happens
    after conversion instead of during the copy loop.

Review fixes folded into this revision (hunk line counts are unchanged):
  * iconv() returns size_t, so the old "iconv(...) >= 0" test was always
    true; failure is now detected by comparing against (size_t)-1.
  * score_utf8() rejected the valid continuation byte 0xBF in 4-byte
    sequences ("< 0xbf"); it now uses "<= 0xbf" like the 2/3-byte states.
  * iconv_open() results were used unchecked; descriptors equal to
    (iconv_t)-1 are no longer passed to iconv() or iconv_close().
  * explicit casts added where char * and unsigned char * were mixed.

NOTE(review): this copy of the patch was whitespace-collapsed and the
targets of the three context "#include" lines in the first hunk were lost.
Restore them from the pristine webalizer.c before applying, and apply with
"patch -l" so collapsed context whitespace still matches.

--- webalizer-2.20-01/webalizer.c.org	2008-07-13 07:07:34.000000000 +0900
+++ webalizer-2.20-01/webalizer.c	2008-11-22 01:42:17.000000000 +0900
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include <iconv.h>
 
 /* ensure getopt */
 #ifdef HAVE_GETOPT_H
@@ -248,6 +249,11 @@
 char *f_cp=f_buf+GZ_BUFSIZE; /* pointer into the buffer */
 int f_end=0; /* count to end of buffer */
 
+#ifndef ICONV_DEST_CODE
+#define ICONV_DEST_CODE "EUC-JP" /* charset search strings are converted to */
+#endif
+iconv_t cd_from_sjis, cd_from_eucj, cd_from_utf8; /* (iconv_t)-1 if unavailable */
+
 char hit_color[] = "#00805c"; /* graph hit color */
 char file_color[] = "#0040ff"; /* graph file color */
 char site_color[] = "#ff8000"; /* graph site color */
@@ -666,6 +672,10 @@
    /* get processing start time */
    start_time = time(NULL);
 
+   cd_from_sjis = iconv_open(ICONV_DEST_CODE, "Shift_JIS");
+   cd_from_eucj = iconv_open(ICONV_DEST_CODE, "EUC-JP");
+   cd_from_utf8 = iconv_open(ICONV_DEST_CODE, "UTF-8");
+
    /*********************************************/
    /* MAIN PROCESS LOOP - read through log file */
    /*********************************************/
@@ -1482,6 +1492,10 @@
    if (geo_fp) GeoIP_delete(geo_fp);
 #endif
 
+   if (cd_from_sjis != (iconv_t)-1) iconv_close(cd_from_sjis);
+   if (cd_from_eucj != (iconv_t)-1) iconv_close(cd_from_eucj);
+   if (cd_from_utf8 != (iconv_t)-1) iconv_close(cd_from_utf8);
+
    /* Whew, all done! Exit with completion status (0) */
    exit(0);
 }
@@ -2084,6 +2098,22 @@
 
    if (!str) return NULL; /* make sure strings valid */
 
+   while(*cp1){ /* for escape code of apache log. */
+     if(*cp1 == '\\' && *(cp1+1) == 'x' &&
+        isxdigit(*(cp1+2)) && isxdigit(*(cp1+3))){
+       *cp2 = from_hex(*(cp1+2))*16 + from_hex(*(cp1+3));
+       if ((*cp2<32)||(*cp2==127)) *cp2='_';
+       cp1+=4; cp2++;
+     }
+     else if(*cp1 == '\\' && *(cp1+1) == '\\'){
+       *cp2++ = '\\';
+       cp1 += 2;
+     }
+     else *cp2++ = *cp1++;
+   }
+   *cp2 = *cp1; /* copy the terminating NUL */
+
+   cp1 = cp2 = (unsigned char *)str; /* rewind for the %-escape pass */
    while (*cp1)
    {
       if (*cp1=='%') /* Found an escape? */
@@ -2116,6 +2146,115 @@
    if (*str1==0) return 0; else return 1;
 }
 
+int score_eucj(unsigned char *str)
+{
+  int state = 0;
+  int score = 0;
+  int bad = 0;
+  if(str == NULL) return -1;
+
+  for(; *str != 0; str++){
+    switch(state){
+    case 0:
+      if(*str >= 0x20 && *str <= 0x7e) score++;        /* ASCII */
+      else if(*str >= 0xa1 && *str <= 0xfe) state = 1; /* KANJI(1) */
+      else if(*str == 0x8f);                           /* HOJYO KANJI */
+      else if(*str == 0x8e) state = 2;                 /* HalfWidth KANA(1) */
+      else if(*str < 0x20);                            /* CTRL */
+      else bad = 1;
+      break;
+    case 1:
+      if(*str >= 0xa1 && *str <= 0xfe) score += 2;     /* KANJI(2) */
+      else bad = 1;
+      state = 0;
+      break;
+    case 2:
+      if(*str >= 0xa1 && *str <= 0xdf);                /* HalfWidth KANA(2) */
+      else bad = 1;
+      state = 0;
+      break;
+    }
+  }
+  if(bad != 0) score = -1;
+  return score;
+}
+
+int score_sjis(unsigned char *str)
+{
+  int state = 0;
+  int score = 0;
+  int bad = 0;
+  if(str == NULL) return -1;
+
+  for(; *str != 0; str++){
+    switch(state){
+    case 0:
+      if(*str >= 0x20 && *str <= 0x7e) score++;        /* ASCII */
+      else if((*str >= 0x81 && *str <= 0x9f) ||
+              (*str >= 0xe0 && *str <= 0xfc)) state = 1; /* SJIS(1) */
+      else if(*str >= 0xa1 && *str <= 0xdf);           /* HalfWidth KANA */
+      else if(*str < 0x20);                            /* CTRL */
+      else bad = 1;
+      break;
+    case 1:
+      if((*str >= 0x40 && *str <= 0x7e) ||
+         (*str >= 0x80 && *str <= 0xfc)) score += 2;   /* SJIS(2) */
+      else bad = 1;
+      state = 0;
+      break;
+    }
+  }
+  if(bad != 0) score = -1;
+  return score;
+}
+
+int score_utf8(unsigned char *str)
+{
+  int state = 0;
+  int score = 0;
+  int bad = 0;
+  if(str == NULL) return -1;
+
+  for(; *str != 0; str++){
+    switch(state){
+    case 0:
+      if(*str >= 0x20 && *str <= 0x7e) score++;
+      else if(*str >= 0xc0 && *str <= 0xdf) state = 1; /* greek etc. */
+      else if(*str >= 0xe0 && *str <= 0xef) state = 2; /* KANJI etc. */
+      else if(*str >= 0xf0 && *str <= 0xf7) state = 4; /* 4-byte lead */
+      else if(*str < 0x20);                            /* CTRL */
+      else bad = 1;
+      break;
+    case 1:
+      if(*str >= 0x80 && *str <= 0xbf) score++;
+      else bad = 1;
+      state = 0;
+      break;
+    case 2:
+      if(*str >= 0x80 && *str <= 0xbf) state = 3;      /* KANJI(2) */
+      else {bad = 1; state = 0; }
+      break;
+    case 3:
+      if(*str >= 0x80 && *str <= 0xbf) score += 3;     /* KANJI(3) */
+      else bad = 1;
+      state = 0;
+      break;
+    case 4:
+    case 5:
+      if(*str >= 0x80 && *str <= 0xbf) state++;        /* 0xbf is a valid trail byte */
+      else {bad = 1; state = 0;}
+      break;
+    case 6:
+      if(*str >= 0x80 && *str <= 0xbf) score += 4;     /* last byte of 4-byte seq */
+      else bad = 1;
+      state = 0;
+      break;
+    }
+  }
+  if(bad != 0) score = -1;
+  return score;
+}
+
 /*********************************************/
 /* SRCH_STRING - get search strings from ref */
 /*********************************************/
@@ -2127,6 +2266,11 @@
    char srch[80]="";
    unsigned char *cp1, *cp2, *cps;
    int sp_flg=0;
+   int sjis, eucj, utf8;
+   char tmpbuf2[BUFSIZE];
+   size_t inlen, outlen;
+   unsigned char *cp3;
+   iconv_t cd;
 
    /* Check if search engine referrer or return */
    if ( (cps=(unsigned char *)isinglist(search_list,log_rec.refer))==NULL)
@@ -2151,9 +2295,7 @@
          if (*cp1=='+') *cp1=' '; /* change + to space */
          if (sp_flg && *cp1==' ') { cp1++; continue; } /* compress spaces */
          if (*cp1==' ') sp_flg=1; else sp_flg=0; /* (flag spaces here) */
-         if (searchcasei)
-            *cp2++=tolower(*cp1++); /* normal character */
-         else *cp2++=*cp1++;
+         *cp2++=*cp1++; /* case folding is done after charset conversion */
       }
    }
    *cp2=0; cp2=(unsigned char *)tmpbuf;
@@ -2165,9 +2307,30 @@
    cp1=cp2+strlen((char *)cp2)-1;
    while (cp1!=cp2) if (isspace((unsigned char)*cp1)) *cp1--='\0'; else break;
 
+   utf8 = score_utf8(cp2);
+   sjis = score_sjis(cp2);
+   eucj = score_eucj(cp2);
+   if(utf8 >= sjis && utf8 >= eucj) cd = cd_from_utf8; /* UTF-8 wins ties */
+   else if(sjis > utf8 && sjis > eucj) cd = cd_from_sjis;
+   else cd = cd_from_eucj;
+   if(cd != (iconv_t)-1) iconv(cd, NULL, 0, NULL, 0); /* reset conversion state */
+   cp3 = cp2;
+   inlen = strlen((char *)cp2)+1; /* +1: convert the NUL too */
+   cp1 = (unsigned char *)tmpbuf2;
+   outlen = sizeof(tmpbuf2);
+   if(cd != (iconv_t)-1 && iconv(cd, (char **)&cp3, &inlen,
+      (char **)&cp1, &outlen) != (size_t)-1 && inlen == 0){
+     cp2 = (unsigned char *)tmpbuf2;
+   }
+   else for(cp1 = cp2; *cp1 != 0; cp1++) if(*cp1 >= 0x80) *cp1 = '_';
+
    /* strip invalid chars */
    cp1=cp2;
-   while (*cp1!=0) { if ((*cp1<32)||(*cp1==127)) *cp1='_'; cp1++; }
+   while (*cp1!=0) {
+     if ((*cp1<32)||(*cp1==127)) *cp1='_';
+     if (searchcasei) *cp1=tolower(*cp1); /* normal character */
+     cp1++;
+   }
 
    if (put_snode((char *)cp2,(u_int64_t)1,sr_htab))
    {