#!/bin/sh
# Split an Apache-style access log into per-alias logfiles.
#
# Reads $WORKFILE (a combined-format access log) and $ALIASES (a table whose
# second column is a URI prefix), then appends each log line to
# ${LOGNAME}<alias>, where <alias> is the matched URI prefix with every '/'
# replaced by '_'.  Lines matching no alias go to ${LOGNAME}catchall.
#
# Set DEBUG to any non-empty value to get timing/progress output on stderr.

# Print a timestamped diagnostic line to stderr, but only when DEBUG is set.
# Arguments: the message words.
report() {
    if [ -n "$DEBUG" ]; then
        # printf instead of 'echo -en': echo's -e/-n flags are not portable
        # under /bin/sh (dash prints them literally).
        printf '%s\t%s\n' "$(date '+%Y/%m/%d %H:%M:%S')" "$*" >&2
    fi
}

# Time a command and report its duration (visible only when DEBUG is set).
# $1 - human-readable title
# $2 - command string (run through eval)
bench() {
    report "started $1"
    _bench_start=$(date +%s)
    eval "$2"
    _bench_end=$(date +%s)
    report "finished $1, took $(( _bench_end - _bench_start )) seconds"
}

# Input/output locations; override these before calling splitlog if needed.
WORKFILE=/tmp/access_log.ends.01May05   # access log to split
WORKDIR=/tmp                            # directory for per-alias output files
ALIASES=/tmp/aliases                    # alias table: column 2 is a URI prefix
LOGNAME=$WORKDIR/log                    # output filename prefix

# Split $WORKFILE into separate files for each alias (see header comment).
# Returns 1 if either input file is missing.
splitlog() {
    # Fail early with one clear message instead of spewing per-line
    # redirection errors when the inputs are absent.
    if [ ! -f "$WORKFILE" ] || [ ! -f "$ALIASES" ]; then
        report "splitlog: missing $WORKFILE or $ALIASES"
        return 1
    fi

    # Extract just the URI-prefix column into a temp file we can grep quickly.
    prefixes=$WORKDIR/aliases.$$
    awk '{print $2}' <"$ALIASES" >"$prefixes"

    while read -r i; do
        # Field 7 of a combined-format log line is the request URI.
        # $i is intentionally unquoted: we want word-splitting here.
        set -- $i
        URI=$7
        # Walk up the path (/a/b/c -> /a/b -> /a -> "") until a prefix matches.
        while [ -n "$URI" ]; do
            URI=${URI%/*}
            if [ -z "$URI" ]; then
                # No alias matched any ancestor: route to the catchall log.
                echo catchall $i
            else
                # BUG FIX: the original used grep -Fc "^$URI\$".  With -F the
                # ^ and $ are literal characters, not anchors, so the lookup
                # never matched.  -x asks for a whole-line fixed-string match,
                # and -q is enough: any hit means "this prefix is an alias".
                if grep -Fxq -- "$URI" "$prefixes"; then
                    echo $URI $i
                    break
                fi
            fi
        done
    done <"$WORKFILE" | awk -v dir="$LOGNAME" '{
        gsub("/", "_", $1)              # alias -> filename-safe token
        dest = sprintf("%s%s", dir, $1) # e.g. /tmp/log_images
        sub($1" ", "")                  # drop the routing column
        print >> dest                   # append to the per-alias logfile
    }'

    rm -f "$prefixes"   # clean up the temp prefix list (the original leaked it)
}

bench "splitting logs" splitlog

# speed increases:
# with a 1000 line logfile and 444 aliases
# default 61 seconds
# one awk call 52 seconds
# grep -c 39 seconds
# --mmap 40 seconds!
# abuse set 40 seconds..
# !dirname 32 seconds
# awk/sort 28 seconds
# remove sort 28 seconds
# fix catchall 24 seconds
# remove tr 20 seconds
# -grep+fgrep 83 seconds! :(
# awk aliases 20 seconds
# grep -F 18 seconds