summaryrefslogtreecommitdiff
path: root/src/sanitize.py
blob: 36cd2107a8a5dcb3a3547e0e348efbdeab517a26 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/python
""" Sanitize Apache web logs by removing all potentially sensitive parts.

The following sanitizing steps are performed on each line of INPUT_FILE:
  1. Discard lines that are not in the expected Apache2 Combined Log Format.
  2. Discard lines with hosts other than '0.0.0.0' or '0.0.0.1'.
  3. Discard all lines with methods other than GET or HEAD.
  4. Discard lines using a protocol other than HTTP.
  5. Discard all lines with status code 400 or 404.
  6. Override client with '-'.
  7. Override user with '-'.
  8. Override time with '00:00:00 +0000'.
  9. Override referer (sic!) with '"-"'.
 10. Override user agent with '"-"'.
 11. Truncate resource after the first '?' character (the '?' is kept).
 12. Die if no date can be extracted from the name of INPUT_FILE.
 13. Discard lines, with a message on stderr, whose date is neither that
     date nor the day before it.

USAGE: sanitize.py INPUT_FILE OUTPUT_DIR

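The main operation is to parse Apache web log files from INPUT_FILE and
output the sanitized version in files according to the date of the log
entry in OUTPUT_DIR. Only lines discarded for a date mismatch are reported
on stderr; all other discarded lines are dropped silently. On success the
name of the output file for the previous day is printed on stdout. A
nonzero exit code indicates an error during processing, error messages go
to stderr.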

The input filename is expected to be in the following format:
    <hostname>.torproject.org-access.log-YYYYMMDD
"""

from __future__ import print_function

import re
import fileinput
import sys
import dateutil.parser
import datetime

# Matches one log line in the expected, already partly anonymized, Apache2
# Combined Log Format.  Capture groups referenced elsewhere in this script:
#   1: last octet of the host ('0' or '1')
#   2: date as DD/Mon/YYYY (the time must already be 00:00:00 +0000)
#   4: request method
#   5: resource up to and including the first '?'
#   6: remainder of the protocol token after 'HTTP'
#   7: status code ('-' or digits)
#   8: the field following the status code ('-' or digits)
# NOTE(review): in the referer group the '\\' alternative is already covered
# by '[^\"]'; the overlap is harmless for matching but can make non-matching
# lines with many backslashes slow to reject.
is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|).* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
# Template for Match.expand(): referer and user agent are replaced by "-" and
# everything after them by a single '-'.
sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n'

# Request methods whose lines are kept; all others are dropped silently.
KEPT_METHODS = ("GET", "HEAD")
# Status codes whose lines are dropped silently.
DISCARDED_STATUSES = ("400", "404")


def extract_date(filename):
    """Return the YYYYMMDD string embedded in *filename*, or None.

    The pattern is anchored at the start of *filename* and allows only
    non-digit characters before the eight digits, so any earlier digit
    (for example in a directory or host name) makes the extraction fail.
    """
    matched = re.match(r'[^0-9]*([0-9]{8})', filename)
    if matched is None:
        return None
    return matched.group(1)


def is_discarded(matched):
    """Return True if a matched log line must be dropped.

    *matched* is a match object produced by ``is_valid_regex``.  A line is
    dropped when its method is not in KEPT_METHODS or its status code is in
    DISCARDED_STATUSES.
    """
    return (matched.group(4) not in KEPT_METHODS
            or matched.group(7) in DISCARDED_STATUSES)


def main(argv):
    """Sanitize the log file named in *argv* and return the exit status.

    *argv* is the full argument vector: program name, INPUT_FILE and
    OUTPUT_DIR.  Sanitized lines are appended to one of two files in
    OUTPUT_DIR, one for the date taken from the INPUT_FILE name and one for
    the day before it.  The name of the latter file is printed on stdout.

    Returns 0 on success and 1 when the arguments are wrong or no valid
    date can be extracted from the INPUT_FILE name.
    """
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if len(argv) != 3:
        print("USAGE: sanitize.py INPUT_FILE OUTPUT_DIR", file=sys.stderr)
        return 1
    input_fname, output_dir = argv[1], argv[2]

    # Extract date from filename
    date_text = extract_date(input_fname)
    if date_text is None:
        print("Could not extract date from", input_fname, file=sys.stderr)
        return 1
    try:
        today = dateutil.parser.parse(date_text)
    except ValueError:
        # Eight digits that do not form a real date, e.g. month 13.
        print("Could not extract date from", input_fname, file=sys.stderr)
        return 1
    day_before = today - datetime.timedelta(days=1)

    today_fname = output_dir + "/" + input_fname + "_sanitized"
    # Same name with the date replaced by the previous day's date.
    yesterday_fname = today_fname.replace(date_text, day_before.strftime("%Y%m%d"))

    log = fileinput.input(input_fname)
    try:
        with open(yesterday_fname, 'a') as file_old:
            with open(today_fname, 'a') as file_new:
                for line in log:
                    matched = is_valid_regex.match(line)
                    if matched is None:
                        # Not in the expected format: dropped silently.
                        continue
                    line_date = dateutil.parser.parse(matched.group(2))
                    if today != line_date and day_before != line_date:
                        print(line, "Last line does not match date constraints. today:", today,
                                " day before:", day_before, " date:", line_date, file=sys.stderr)
                        continue
                    if is_discarded(matched):
                        continue

                    # A check that dropped lines whose host is 0.0.0.0 used
                    # to live here and was disabled with the note:
                    # "metrics weblogs don't start lines with 0.0.0.0 :(".

                    sanitized = matched.expand(sanitized_regex)
                    if today == line_date:
                        file_new.write(sanitized)
                    else:
                        file_old.write(sanitized)
    finally:
        # Release the input file even if an output file could not be opened
        # or a line carried an unparsable date.
        log.close()

    print(yesterday_fname)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))