/* Distributed Checksum Clearinghouse
 *
 * compute fuzzy body checksum #1
 *
 * --C-LICENSE--
 * $Revision: 1.74 $
 */

#include "dcc_ck.h"

/* at most this many bytes are fed to the FUZ1 MD5 sum (see add_sum()) */
#define MAX_FUZ1_LEN	(4*1024)


/* Process more of a character reference
 *	-1=need more
 *	0=character in cref->result and c is the terminal ';'
 *	1=character in cref->result but stop short before c
 *
 * The state machine lives in cref->st.  The caller feeds one byte per call,
 * starting with the '&' itself (which only moves the machine from
 * CREF_ST_IDLE to CREF_ST_START).
 * Numeric references are clamped to 255.
 */
int
ck_cref(CK_CREF *cref, u_char uc)
{
	u_int n;

	switch (cref->st) {
	case CREF_ST_IDLE:
		/* this call consumes the '&' */
		cref->st = CREF_ST_START;
		return -1;

	case CREF_ST_START:
		/* start to get a HTML character reference.
		 * We have already passed the '&' and need '#' or a letter */
		if (uc == '#') {
			cref->st = CREF_ST_NUM;
		} else if (DCC_IS_LOWER(uc) || DCC_IS_UPPER(uc)) {
			CK_WORD_CLEAR(&cref->w);
			cref->w.b[0] = uc;
			cref->len = 1;
			cref->st = CREF_ST_NAME;
		} else {
			/* Give up on an invalid character reference. */
			cref->st = CREF_ST_IDLE;
			return -1;
		}
		break;

	case CREF_ST_NUM:
		/* we've seen "&#".  look for 'x' or the first digit */
		if (uc == 'x' || uc == 'X') {
			cref->result = 0;
			cref->st = CREF_ST_HEX;
		} else if (uc >= '0' && uc <= '9') {
			cref->result = uc - '0';
			cref->st = CREF_ST_DEC;
		} else {
			/* not a number; hand back the '#' and let the caller
			 * look at this byte again */
			cref->result = '#';
			cref->st = CREF_ST_IDLE;
			return 1;
		}
		break;

	case CREF_ST_DEC:
		if (uc >= '0' && uc <= '9') {
			n = cref->result*10 + (uc - '0');
		} else {
			/* end of the digits; 0 if uc is the ';' else 1 */
			cref->st = CREF_ST_IDLE;
			return (uc != ';');
		}
		if (n > 255)
			n = 255;
		cref->result = n;
		break;

	case CREF_ST_HEX:
		if ((uc >= 'a' && uc <= 'f') || (uc >= 'A' && uc <= 'F')) {
			/* low nibble of 'a'..'f' or 'A'..'F' is 1..6 */
			n = (cref->result<<4) + (uc & 0xf) + 9;
		} else if (uc >= '0' && uc <= '9') {
			n = (cref->result<<4) + (uc - '0');
		} else {
			/* end of the digits; 0 if uc is the ';' else 1 */
			cref->st = CREF_ST_IDLE;
			return (uc != ';');
		}
		if (n > 255)
			n = 255;
		cref->result = n;
		break;

	case CREF_ST_NAME:
		if (DCC_IS_LOWER(uc) || DCC_IS_UPPER(uc) || DCC_IS_DIGIT(uc)) {
			/* If the word is too long, the final match will fail.
			 * This will consume the word safely. */
			if (cref->len < sizeof(CK_WORD))
				cref->w.b[cref->len++] = uc;
		} else {
			/* this character ends the cref */
			cref->result = lookup_cref(&cref->w, cref->len);
			cref->st = CREF_ST_IDLE;
			return (uc != ';');
		}
		break;
	}

	/* accept this character of the character reference and wait for more */
	return -1;
}



#define UTF8_MAX	    0x10ffff

/* Punycode parameters */
#define PUNY_PREFIX	    "xn--"
#define PUNY_PREFIX_LEN	    LITZ(PUNY_PREFIX)
#define PUNY_BASE	    36
#define PUNY_TMIN	    1
#define PUNY_TMAX	    26
#define PUNY_SKEW	    38
#define PUNY_DAMP	    700
#define PUNY_INITIAL_BIAS   72
#define PUNY_INITIAL_N	    0x80
#define PUNYCODE36	    "abcdefghijklmnopqrstuvwxyz0123456789"

/* Punycode bias adaptation
 *	delta=the delta that was just encoded
 *	numpoints=code points handled so far including the current one
 *	firsttime=non-zero for the first delta of the label
 * returns the new bias */
static inline int
puny_adapt(u_int delta, u_int numpoints, u_char firsttime)
{
	int k;

	if (firsttime) {
		delta /= PUNY_DAMP;
	} else {
		delta /= 2;
	}
	delta += (delta / numpoints);

	for (k = 0;
	     delta > ((PUNY_BASE-PUNY_TMIN) * PUNY_TMAX) / 2;
	     k += PUNY_BASE) {
		delta /= (PUNY_BASE - PUNY_TMIN);
	}

	return k + ((PUNY_BASE-PUNY_TMIN+1) * delta) / (delta+PUNY_SKEW);
}



/* encode Punycode for one label, probably expanding the label in the buffer
 *
 * The label is url->doms_buf[url->label_start .. url->dom_end-1].
 * It is decoded from UTF-8 and then Punycode encoded with the "xn--" prefix.
 * On success the label is overwritten and url->dom_end is adjusted.
 * On invalid UTF-8, or when the result does not fit, the label is
 * left unchanged.
 * URL_IDN is always cleared. */
static void
punyencode(CK_URL *url)
{
	/* append one byte to the encoded label or give up if it is full */
#	define PUNY_OUT(_c) do {if (puny_len >= URL_LABEL_MAX) return;	\
		puny[puny_len++] = (_c);} while (0)
	u_char puny[URL_LABEL_MAX];
	u_int32_t unicode[URL_LABEL_MAX];
	u_int unicode_len;
	u_int idx, n, delta, bias, h, b;
	int puny_len;
	u_int32_t uc, uc2, uc3, uc4;
	int i;

	url->flags &= ~URL_IDN;

	/* decode UTF-8 */
	i = url->label_start;
	unicode_len = 0;
	do {
		uc = (u_char)url->doms_buf[i++];
		if ((uc & 0x80) != 0) {
			if ((uc & 0xe0) == 0xc0) {
				/* 2-byte sequence */
				uc &= 0x1f;
				/* give up on an invalid UTF-8 sequence */
				if (uc == 0 || i >= url->dom_end)
					return;
				/* a continuation byte is 10xxxxxx, so
				 * flipping bit 7 must leave only 6 bits */
				uc2 = (u_char)url->doms_buf[i++] ^ 0x80;
				if (uc2 > 0x3f)
					return;
				uc = (uc << 6) | uc2;
			} else if ((uc & 0xf0) == 0xe0) {
				/* 3-byte sequence */
				uc &= 0x0f;
				if (uc == 0 || i+1 >= url->dom_end)
					return;
				uc2 = (u_char)url->doms_buf[i++] ^ 0x80;
				uc3 = (u_char)url->doms_buf[i++] ^ 0x80;
				if (uc2 > 0x3f || uc3 > 0x3f)
					return;
				uc = (((uc << 6) | uc2) << 6) | uc3;
			} else if ((uc & 0xf8) == 0xf0) {
				/* 4-byte sequence */
				uc &= 0x07;
				if (uc == 0 || i+2 >= url->dom_end)
					return;
				uc2 = (u_char)url->doms_buf[i++] ^ 0x80;
				uc3 = (u_char)url->doms_buf[i++] ^ 0x80;
				uc4 = (u_char)url->doms_buf[i++] ^ 0x80;
				if (uc2 > 0x3f || uc3 > 0x3f || uc4 > 0x3f)
					return;
				uc = (((((uc << 6) | uc2) << 6) | uc3) << 6)
				     | uc4;
			} else {
				return;
			}
			if (uc > UTF8_MAX)
				return;
		}
		if (unicode_len >= URL_LABEL_MAX)
			return;
		unicode[unicode_len++] = uc;
	} while (i != url->dom_end);

	/* Puny encode */
	puny_len = PUNY_PREFIX_LEN;
	memcpy(puny, PUNY_PREFIX, puny_len);

	n = PUNY_INITIAL_N;
	delta = 0;
	bias = PUNY_INITIAL_BIAS;
	b = 0;

	/* copy basic code points */
	for (idx = 0; idx < unicode_len; ++idx) {
		uc = unicode[idx];
		if (uc >= 0x80)
			continue;
		PUNY_OUT(uc);
		++b;
	}
	if (b != 0)
		PUNY_OUT('-');

	/* h=code points handled so far */
	h = b;
	while (h < unicode_len) {
		u_int m;

		/* find the smallest code point >= n.
		 * Only the unicode_len decoded entries are valid, so the
		 * bound must be exclusive. */
		m = UTF8_MAX+1;
		for (idx = 0; idx < unicode_len; ++idx) {
			if (unicode[idx] < m && unicode[idx] >= n)
				m = unicode[idx];
		}

		delta += (m - n) * (h + 1);
		n = m;

		for (idx = 0; idx < unicode_len; ++idx) {
			if (unicode[idx] < n) {
				++delta;
			} else if (unicode[idx] == n) {
				u_int k, q;

				/* emit delta as a variable length integer */
				q = delta;
				for (k = PUNY_BASE; ; k += PUNY_BASE) {
					u_int t;

					t = (k <= bias ? PUNY_TMIN
					     : k >= bias + PUNY_TMAX
					     ? PUNY_TMAX
					     : k - bias);
					if (q < t)
						break;
					PUNY_OUT(PUNYCODE36[t + (q-t)
						 % (PUNY_BASE-t)]);
					q = (q-t) / (PUNY_BASE-t);
				}
				PUNY_OUT(PUNYCODE36[q]);

				bias = puny_adapt(delta, h+1, h == b);
				delta = 0;
				++h;
			}
		}
		++delta;
		++n;
	}

	/* replace the label only if the encoded form fits */
	if (url->label_start + puny_len <= URL_HOST_MAX) {
		url->dom_end = url->label_start + puny_len;
		memcpy(&url->doms_buf[url->label_start], puny, puny_len);
	}
}



/* forget the host name being collected and
 * note where the URL starts in the FUZ1 buffer */
static inline void
reset_doms_buf(CK_URL *url, char *url_start)
{
	url->start = url_start;
	url->tld = 0;
	url->sld = 0;
	url->dom_end = url->dom_start;
	url->label_start = url->dom_end;
	url->punct = 0;
	url->dot1 = 0;
	url->dot2 = 0;
	url->url_total = 0;
	url->flags &= ~(URL_IDN | URL_DOT);
}



/* save a byte of the host name */
static inline void
url_host_save(CK_URL *url, char uc)
{
	/* abandon a name that has reached URL_HOST_MAX bytes */
	if (url->dom_end - url->dom_start >= URL_HOST_MAX) {
		reset_doms_buf(url, 0);
		url->flags |= URL_TOO_LONG;
		url->st = URL_ST_IDLE;
	}
	if (url->flags & URL_TOO_LONG)
		return;

	url->doms_buf[url->dom_end++] = uc;
}



/* DNSBL check the host name from a URL
 *
 * Terminate the collected name, pass it to url_dnsbl(), and then trim
 * it to its last two labels for the fuzzy checksums. */
static inline void
url_host_end(GOT_CKS *cks)
{
	int len;
	CK_URL *url = &cks->url;

	/* forget port number */
	if (url->punct != 0)
		url->dom_end = url->punct;

	/* nothing to do for an empty or abandoned name */
	if (url->dom_end <= url->dom_start
	    || (url->flags & URL_TOO_LONG))
		return;

	/* encode the final label if it might contain UTF-8 */
	if (url->flags & URL_IDN)
		punyencode(url);

	url->doms_buf[url->dom_end++] = '\0';
	url_dnsbl(cks->dlw, &url->doms_buf[url->dom_start]);

	/* Delete 3rd level domain labels for the fuzzy checksums */
	if (url->dot2 != 0) {
		if (cks->fuz1.eol == 0) {
			cks->fuz1.cp = url->start;
		} else {
			/* Move the end-of-line position if it exists. */
			len = cks->fuz1.cp - url->sld;
			memmove(url->start, url->sld, len);
			if (cks->fuz1.eol >= cks->fuz1.cp)
				cks->fuz1.eol = url->start + len;
			cks->fuz1.cp = url->start + len;
		}

		/* trim the name for the fuz2 checksum only if
		 * we might use it */
		if (cks->fuz2.lang[0].wtotal < FUZ2_MIN_WORDS) {
			len = url->dom_end - url->dot2;
			memmove(&url->doms_buf[url->dom_start],
				&url->doms_buf[url->dot2],
				len);
			url->dom_end = url->dom_start + len;
		}
	}

	/* the next name starts after this one */
	url->dom_start = url->dom_end;
	++url->doms;
}



/* capture host name and skip parts of URLs
 *  We need
 *	parts of host names for fuz1 checksums
 *	some host names sometimes for fuz2 checksum
 *	all host names for DNS white- and blacklist checks
 *
 * returns
 *	URL_RES_FWD	    the caller should treat the byte as ordinary text
 *	URL_RES_CK_SPACE    "http://" or "https://" was just completed and the
 *			    caller should make room in the FUZ1 buffer
 *	URL_RES_SKIP	    the byte is part of a URL tail and should be
 *			    ignored
 */
static URL_RES
ck_url(GOT_CKS *cks,
       u_char uc,			/* lower case character */
       const char **pbp)		/* uc in input buffer */
{
	CK_URL *url;
	int i;

	url = &cks->url;

	/* Decode HTML character references before looking at the byte. */
	if (uc == '&' || url->cref.st != CREF_ST_IDLE) {
		i = ck_cref(&url->cref, uc);
		if (i < 0)
			return URL_RES_FWD;
		/* back up the input by one byte if the reference
		 * stopped short of this byte */
		*pbp -= i;
		uc = url->cref.result;
	}

	switch (url->st) {
	case URL_ST_IDLE:
		if (uc == 'h') {
			/* start looking for 't' after 'h' in "http" */
			url->st = URL_ST_T1;
			url->flags = 0;
		} else if (uc == '=') {
			/* maybe found the '=' in "href=" "img src=" etc */
			url->st = URL_ST_QUOTE;
			url->flags = 0;
		}
		break;

	case URL_ST_QUOTE:
		/* look for '"' '\'' or 'H' after "href=", "img src=", etc.
		 * track quotes to help the FUZ1 checksum skip all of the end
		 * of a URL */
		if (uc == 'h') {
			url->st = URL_ST_T1;
		} else if (uc == '"') {
			url->flags |= URL_DQUOTED;
			url->st = URL_ST_QH;
		} else if (uc == '\'') {
			/* set the flag and assign the state,
			 * exactly as for '"' */
			url->flags |= URL_SQUOTED;
			url->st = URL_ST_QH;
		} else {
			url->st = URL_ST_IDLE;
		}
		break;

	case URL_ST_QH:
		/* seen quote; looking for start of URL
		 * These states should ignore whitespace including '\n' among
		 * 'https://', but fixing that error would break checksum
		 * compatibility, albeit on very few URLs. */
		if (uc == 'h') {
			url->st = URL_ST_T1;
		} else {
			url->st = URL_ST_IDLE;
		}
		break;

	case URL_ST_T1:
		if (uc == 't')
			url->st = URL_ST_T2;
		else
			url->st = URL_ST_IDLE;
		break;

	case URL_ST_T2:
		if (uc == 't')
			url->st = URL_ST_P;
		else
			url->st = URL_ST_IDLE;
		break;

	case URL_ST_P:
		if (uc == 'p')
			url->st = URL_ST_S;
		else
			url->st = URL_ST_IDLE;
		break;

	case URL_ST_S:
		/* we are expecting the ':' or 's' after http */
		if (uc == 's')
			url->st = URL_ST_COLON;
		else if (uc == ':')
			url->st = URL_ST_SLASH1;
		else
			url->st = URL_ST_IDLE;
		break;

	case URL_ST_COLON:
		/* we are expecting the ':' after http or https */
		if (uc == ':')
			url->st = URL_ST_SLASH1;
		else
			url->st = URL_ST_IDLE;
		break;

	case URL_ST_SLASH1:
		if (uc == '/') {
			/* got the first '/' after http: */
			url->st = URL_ST_SLASH2;
		} else {
			url->st = URL_ST_IDLE;
		}
		break;

	case URL_ST_SLASH2:
		if (uc == '/') {
			/* Got the second '/' after "http:/" */

			/* Make room in the host name buffer if we are too
			 * close to its end for a maximum sized name.
			 * Flush the buffer if we will not use the old names
			 * for the fuz2 checksum.  Otherwise discard the first
			 * name in the buffer. */
			if (cks->fuz2.lang[0].wtotal >= FUZ2_MIN_WORDS) {
				url->dom_start = 0;
			} else {
				while (url->dom_start > (ISZ(url->doms_buf)
							 - URL_HOST_MAX_SAVE)) {
					char *p;

					p = memchr(url->doms_buf, '\0',
						   url->dom_start);
					if (!p) {
						/* there must be >1 name in the
						 * buffer because it is large
						 * enough for at least 2 */
						dcc_logbad(EX_SOFTWARE,
							   "bad url->doms_buf");
					}
					/* drop the first name and its '\0' */
					url->dom_start -= (++p
							   - url->doms_buf);
					memmove(url->doms_buf, p,
						url->dom_start);
				}
			}
			url->st = URL_ST_SLASH3_START;

			/* tell the FUZ1 code to check for room in its buffer */
			return URL_RES_CK_SPACE;
		}
		url->st = URL_ST_IDLE;
		break;

	case URL_ST_SLASH3_START:
		reset_doms_buf(url, cks->fuz1.cp);
		url->st = URL_ST_SLASH3;
		/* fall through */
	case URL_ST_SLASH3:
		/* look for the end of the host name */
		if (++url->url_total > URL_FAILSAFE) {
			url->st = URL_ST_IDLE;
			break;
		}
		if (!((uc >= 'a' && uc <= 'z')
		      || (uc >= '0' && uc <= '9')
		      || uc == '-' || uc == '_')) {
			if (uc == '.' && url->punct == 0) {
				if (url->flags & URL_DOT) {
					/* consecutive '.' are invalid in
					 * host names, but can end URLs */
					url->punct = url->dom_end;
					break;
				}
				/* delay dealing with '.' until we are sure
				 * it is not at the end of the name */
				url->flags |= URL_DOT;
				break;
			}

			/* treat '\\' like '/' for Microsoft */
			if (uc == '/' || uc == '\\') {
				url_host_end(cks);
				url->st = URL_ST_SKIPPING_URL;
				break;
			}
			/* a quote ends the host name, but ends the whole URL
			 * only if it is not inside the other kind of quote */
			if (uc == '"') {
				url_host_end(cks);
				if (url->flags & URL_SQUOTED)
					url->st = URL_ST_SKIPPING_URL;
				else
					url->st = URL_ST_IDLE;
				break;
			}
			if (uc == '\'') {
				url_host_end(cks);
				if ((url->flags & URL_DQUOTED))
					url->st = URL_ST_SKIPPING_URL;
				else
					url->st = URL_ST_IDLE;
				break;
			}

			/* Whitespace ends the URL for old versions of Firefox
			 * and cannot be in a host name, user name, or
			 * password.
			 * Ignore the fact that newer versions and the standard
			 * ignores whitespace in quoted URLs.  Fixing this
			 * would break checksum compatibility on very few URLs.
			 * We must keep nulls out of URLs.
			 * So assume all control characters end things. */
			if (uc == '<' || uc == '>' || uc <= ' '
			    || (uc == 0xa0 && !(url->flags & URL_IDN))) {
				url_host_end(cks);
				if (url->flags & URL_QUOTED)
					url->st = URL_ST_SKIPPING_URL;
				else
					url->st = URL_ST_IDLE;
				break;
			}

			/* ignore username and password */
			if (uc == '@' && !(url->flags & URL_USERNAME)) {
				url->flags |= URL_USERNAME;
				/* discard what was collected and
				 * start the host name again */
				cks->fuz1.cp = url->start;
				url->st = URL_ST_SLASH3_START;
				break;
			}

			/* Note characters that can be in RFC 1738 user names
			 * and passwords but not host names
			 * or a port number after a host name.
			 * RFC 1738 said
			 *  login     = [ user [ ":" password ] "@" ] hostport
			 *  user      = *[ uchar | ";" | "?" | "&" | "=" ]
			 *  password  = *[ uchar | ";" | "?" | "&" | "=" ]
			 *  uchar     = unreserved | escape
			 *  unreserved= alpha | digit | safe | extra
			 *  safe      = "$" | "-" | "_" | "." | "+"
			 *  extra     = "!" | "*" | "'" | "(" | ")" | ","
			 */
			if (uc == ':'
			    || ((uc == ';' || uc == '?' || uc == '&'
				 || uc == '=' || uc == '$' || uc == '+'
				 || uc == '!' || uc == '*'
				 || uc == '(' || uc == ')' || uc == ',')
				&& !(url->flags & URL_USERNAME))) {
				if (url->punct == 0)
					url->punct = url->dom_end;

			} else if ((uc & 0xc0) == 0xc0 && url->punct == 0) {
				/* possible UTF8 for Puny encoding */
				/* NOTE(review): this accepts only UTF-8 lead
				 * bytes (11xxxxxx).  Continuation bytes
				 * (10xxxxxx) fall into the else branch below
				 * and end the host name.  Confirm that is
				 * intended before changing it, because a
				 * change could affect checksum
				 * compatibility. */
				url->flags |= URL_IDN;

			} else {
				/* other strangeness probably ends the URL and
				 * certainly the host name */
				url_host_end(cks);
				if (url->flags & URL_QUOTED)
					url->st = URL_ST_SKIPPING_URL;
				else
					url->st = URL_ST_IDLE;
				break;
			}
		}

		/* Save this byte of the URL for the fuz2 checksum
		 * and DNSBL checks unless we might be in a password */
		if (url->punct == 0) {
			if (url->flags & URL_DOT) {
				/* first deal with a delayed '.' */
				url->flags &= ~URL_DOT;
				if (url->flags & URL_IDN)
					punyencode(url);
				url_host_save(url, '.');
				/* remember the last two dots and where the
				 * last two labels start in the FUZ1 buffer */
				url->dot2 = url->dot1;
				url->dot1 = url->dom_end;
				url->label_start = url->dom_end;
				url->sld = url->tld;
				url->tld = cks->fuz1.cp;
			}
			url_host_save(url, uc);
		}
		break;

	case URL_ST_SKIPPING_URL:
		/* skip the rest of the URL */
		++url->url_total;
		if ((uc == '"' && !(url->flags & URL_SQUOTED))
		    || (uc == '\'' && !(url->flags & URL_DQUOTED))
		    || (uc == '>' && !(url->flags & URL_QUOTED))
		    || uc == ' ' || uc == '\t' || uc == '\n' || uc == '\r'
		    || (uc == 0xA0 && !(url->flags & URL_IDN))
		    || url->url_total > URL_FAILSAFE)
			url->st = URL_ST_IDLE;
		return URL_RES_SKIP;
	}

	return URL_RES_FWD;
}



/* get ready to compute the FUZ1 checksum of a message */
void
dcc_ck_fuz1_init(GOT_CKS *cks)
{
	cks->sums[DCC_CK_FUZ1].type = DCC_CK_FUZ1;
	cks->fuz1.total = 0;		/* bytes summed */
	cks->fuz1.cp = cks->fuz1.buf;
	cks->fuz1.eol = 0;

	cks->url.st = URL_ST_IDLE;
	cks->url.cref.st = CREF_ST_IDLE;

	MD5Init(&cks->fuz1.md5);
}



/* recognize a line that starts with one of a few fixed words
 *	cp=start of the line, llen=its length
 * A line matches only if it is at least one byte longer than the word. */
static inline u_char			/* 0=keep the line, 1=discard it */
dear_sucker(const char *cp, u_int llen)
{
#define DEAR_WORD(w) (llen >= sizeof(w) && !strncmp(cp, w, LITZ(w)))

	if (DEAR_WORD("dear"))
		return 1;
	if (DEAR_WORD("hello"))
		return 1;
	if (DEAR_WORD("greeting"))
		return 1;
	if (DEAR_WORD("date"))
		return 1;
	return 0;

#undef DEAR_WORD
}



/* add the first len bytes of the FUZ1 buffer to the MD5 sum,
 * but never sum more than MAX_FUZ1_LEN bytes in total */
static void
add_sum(CTX_FUZ1 *fz1, int len)
{
	int i;

	if (len == 0)
		return;

	/* limit fuz1 span */
	i = MAX_FUZ1_LEN - (fz1->total + len);
	if (i < 0)
		len += i;
	if (len == 0)
		return;

	MD5Update(&fz1->md5, (void *)fz1->buf, len);
	fz1->total += len;
}



/* add bp_len bytes of message body at bp to the FUZ1 checksum
 *
 * Only ASCII letters, folded to lower case, are collected in cks->fuz1.buf.
 * Bytes that look like part of a URL go through ck_url(), which may edit
 * the buffer or ask for the byte to be skipped.
 * Nothing is done if the FUZ1 checksum has been disabled. */
void
dcc_ck_fuz1(GOT_CKS *cks, const char *bp, u_int bp_len)
{
#define CLEAN_FUZ1_BUF
	const char *bp_lim;
	int len;
	URL_RES url_res;
	char *cp, c;

	if (cks->sums[DCC_CK_FUZ1].type != DCC_CK_FUZ1)
		return;

	bp_lim = bp+bp_len;
	cp = cks->fuz1.cp;
	for (;;) {
		if (bp >= bp_lim) {
			/* Sum the buffer if it ends with a line.
			 * Every message always ends with an artificial "\n". */
			if (cp != cks->fuz1.buf
			    && cks->fuz1.eol == cp) {
				add_sum(&cks->fuz1, cp - cks->fuz1.buf);
				cp = cks->fuz1.buf;
				cks->fuz1.eol = 0;
#ifdef CLEAN_FUZ1_BUF
				memset(cp, 0, sizeof(cks->fuz1.buf));
#endif
			}
			/* remember our place for the next call */
			cks->fuz1.cp = cp;
			return;
		}

		c = *bp++;
		/* convert ASCII upper to lower case */
		c = DCC_TO_LOWER(c);

		/* Look for start of a URL.  Do not worry about HTML character
		 * references because Thunderbird does not recognize
		 *	asdf
		 * or	Hhttp://domain.com */
		if (c == 'h' || c == '='
		    || cks->url.st != URL_ST_IDLE) {
			/* ck_url() and its helpers read and move fuz1.cp */
			cks->fuz1.cp = cp;
			url_res = ck_url(cks, c, &bp);
			cp = cks->fuz1.cp;
			switch (url_res) {
			case URL_RES_FWD:
				/* add character to buffer for eventual sum */
				break;

			case URL_RES_CK_SPACE:
				/* Make room before starting a URL
				 * if we are too close to the end of
				 * the buffer for the most that we will
				 * save and then checksum of a URL. */
				while (cp >= &cks->fuz1.buf[sizeof(
							cks->fuz1.buf)
							- URL_HOST_MAX]) {
					if (!cks->fuz1.eol) {
						/* no complete line,
						 * so sum everything */
						add_sum(&cks->fuz1,
							cp - cks->fuz1.buf);
						cp = cks->fuz1.buf;
						/* no more EOL */
						cks->fuz1.eol = 0;
					} else {
						/* sum the complete lines and
						 * keep the partial line */
						len = (cks->fuz1.eol
						       - cks->fuz1.buf);
						add_sum(&cks->fuz1, len);
						memmove(cks->fuz1.buf,
							cks->fuz1.eol,
							cp - cks->fuz1.eol);
						cp -= len;
						cks->fuz1.eol = 0;
					}
#ifdef CLEAN_FUZ1_BUF
					memset(cp, 0,
					       sizeof(cks->fuz1.buf)
					       - (cp-cks->fuz1.buf));
#endif
				}
				break;

			case URL_RES_SKIP:
				continue;
			}
		}

		/* collect only ASCII letters */
		if (DCC_IS_LOWER(c)) {
			/* Collect more of a new line */
			*cp = c;
			if (++cp < &cks->fuz1.buf[sizeof(cks->fuz1.buf)])
				continue;

			/* We are at the end of the buffer,
			 * so add it to the checksum */
			add_sum(&cks->fuz1, cp - cks->fuz1.buf);
			cp = cks->fuz1.buf;
			cks->fuz1.eol = 0;  /* no more EOL */
#ifdef CLEAN_FUZ1_BUF
			memset(cp, 0, sizeof(cks->fuz1.buf));
#endif
			cks->url.st = URL_ST_IDLE;
			continue;
		}

		if (c == '\n') {
			/* Ignore short lines starting with some strings
			 * This should not be done if cks->url.st=URL_ST_SLASH3
			 * or cks->url.st=URL_ST_SLASH3_START, but fixing that
			 * would break checksum compatibility, albeit on
			 * very few URLS. */
			if (cks->fuz1.eol
			    && (len = cp - cks->fuz1.eol) > 0
			    && len <= FUZ1_MAX_LINE
			    && dear_sucker(cks->fuz1.eol, len)) {
				/* drop the line */
				cp = cks->fuz1.eol;
				cks->url.st = URL_ST_IDLE;
				continue;
			}

			/* Add the line to the checksum if we do not
			 * have room in the buffer for another line */
			if (cp >= &cks->fuz1.buf[sizeof(cks->fuz1.buf)
						 - (FUZ1_MAX_LINE
						    + HTTPS_LEN)]) {
				add_sum(&cks->fuz1, cp - cks->fuz1.buf);
				cp = cks->fuz1.buf;
				cks->fuz1.eol = 0;  /* no more EOL */
#ifdef CLEAN_FUZ1_BUF
				memset(cp, 0, sizeof(cks->fuz1.buf));
#endif
				cks->url.st = URL_ST_IDLE;
				continue;
			}
			cks->fuz1.eol = cp;
		}
	}
}



/* finish the FUZ1 checksum
 *
 * The checksum is marked DCC_CK_INVALID if fewer than 30 bytes were summed.
 * Otherwise the MD5 digest is stored and flagged to be reported. */
void
dcc_ck_fuz1_fin(GOT_CKS *cks)
{
	if (cks->sums[DCC_CK_FUZ1].type != DCC_CK_FUZ1)
		return;

	/* we cannot compute a checksum on an empty or nearly empty message */
	if (cks->fuz1.total < 30) {
		cks->sums[DCC_CK_FUZ1].type = DCC_CK_INVALID;
		return;
	}

	MD5Final(cks->sums[DCC_CK_FUZ1].sum.b, &cks->fuz1.md5);
	cks->sums[DCC_CK_FUZ1].rpt2srvr = 1;
}