#! /bin/sh
# Report which referers and which search queries (i.e. what keywords) led
# visitors to your site.
#
# This script assumes the referer info is part of the access log file, in
# APACHE format (the referer is read from the 11th space-separated field).
#
# Put this script in ~/bin/ and run it every time you back up your log file
# (here named ~/logs/access_log.1).
# If the referer file is separate from the log file, you might want to join
# them both with the command:
#     paste -d" " access.log referer.log >access_log.1
#
# This program can also run on Windows with cygwin and activeperl.
# For more explanations, look at http://www.gdargaud.net/Hack/Searches.html
#
# Files read and updated:
#   ~/logs/access_log.1       the log to analyse (read only)
#   ~/bin/Referers.txt.gz     every referer reported by previous runs
#   ~/bin/PhraseList.txt.gz   every search phrase reported by previous runs
#
# NOTE: "set -e" is deliberately not used: many pipeline stages below are
# greps that legitimately exit non-zero when nothing matches.

# List of the various search engine query strings, separated by \|
Queries='q=\|qry=\|query=\|search=\|ask=\|Term=\|Topic=\|pt=\|wf,\|prev=\|back=\|str=\|Keywords=\|qkw=\|kwd=\|p=\|searchfor='

# List of referers to ignore.
# Should contain your own website name and aliases as well as "-", separated by \|
Avoid='"-"\|gdargaud.net\|sung3\|rome.atmos'

LogFile=~/logs/access_log.1
RefererList=~/bin/Referers.txt
PhraseList=~/bin/PhraseList.txt

# Print a blank line followed by the section banner given as $1.
section() {
  echo
  echo "$1"
}

# Filter: decode %XX URL escapes found on stdin.
url_decode() {
  perl -p -e 's/%(..)/pack("c", hex($1))/eg'
}

# Filter: collapse identical lines (ignoring case) and list them with their
# number of occurrences, most frequent first.
rank_by_count() {
  sort -bf | uniq -ci | sort -nrf
}

# extract_searches LOGFILE
# For every request whose log line contains one of the $Queries keys, print
# "search phrase -> requested page". Lines the sed expression could not
# rewrite are passed through without the " -> " marker (they are reported
# later as leftovers).
extract_searches() {
  grep -- "$Queries" "$1" |
    url_decode |
    sed -e "s/.*GET \(.*\) HTTP.*\($Queries\)\([^&, ]*\).*/\3 -> \1/" \
        -e "s/cache:[^+ ]*+//" |
    url_decode |
    sed -e 's/"//g' -e "s/+/ /g" |
    tr -s " " " " |
    sed -e "s/^ //"
}

# static_referers LOGFILE
# Print the referer field of every request that carries no search query,
# whose referer has no '?' in it, and that does not match $Avoid.
static_referers() {
  grep -v -- "$Queries" "$1" |
    cut -d" " -f11 |
    grep -v '?' |
    grep -v -e "$Avoid"
}

# search_phrases SEARCHFILE
# Print only the phrase part of the "phrase -> page" lines of SEARCHFILE.
search_phrases() {
  grep -a " -> " "$1" | sed -e "s/ -> .*//"
}

# open_history FILE
# Make the history list FILE available uncompressed. An existing FILE is kept
# as is (a previous run may have been interrupted before recompressing it);
# on the very first run an empty list is created.
open_history() {
  if [ -f "$1.gz" ] && [ ! -f "$1" ]; then
    gunzip "$1.gz"
  elif [ ! -f "$1" ]; then
    touch "$1"
  fi
}

# update_history FILE SHORT_RE
# Append the entries read on stdin to the history list FILE, drop the entries
# matching the extended regex SHORT_RE (too short to be meaningful), remove
# duplicates (ignoring case) and compress the result to FILE.gz.
# Uses $WorkDir (set by main) for the intermediate file.
update_history() {
  cat >>"$1"
  grep -avE "$2" "$1" | sort -bf | uniq -i >"$WorkDir/history" &&
    mv "$WorkDir/history" "$1"
  # -f: replace a stale FILE.gz instead of prompting.
  gzip -f "$1"
}

# Produce the whole report on stdout. Returns 1 without touching any file
# when the log cannot be read or no temporary directory can be created.
main() {
  if [ ! -r "$LogFile" ]; then
    printf '%s: cannot read %s\n' "$0" "$LogFile" >&2
    return 1
  fi

  # Private scratch directory instead of predictable names in /tmp;
  # removed on any exit path, including interruption.
  WorkDir=$(mktemp -d "${TMPDIR:-/tmp}/searches.XXXXXX") || return 1
  trap 'rm -rf -- "$WorkDir"' EXIT
  trap 'exit 1' HUP INT TERM
  Searches=$WorkDir/searches
  Referers=$WorkDir/referers

  date

  # Let's make a list of queries coming from remote pages to use in the
  # various analyses, and a list of plain referers (computed only once).
  extract_searches "$LogFile" >"$Searches"
  static_referers "$LogFile" >"$Referers"

  # You can reorder the following sections the way you like

  section "++++++++ New referers +++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  open_history "$RefererList"
  grep -aivxFf "$RefererList" "$Referers" | rank_by_count
  # Update the list of past referers
  update_history "$RefererList" '^.{1,5}$' <"$Referers"

  section "++++++++ Most popular referers (static pages) ++++++++++++++++++++++++++++++++++"
  rank_by_count <"$Referers"

  section "++++++++ Most popular search engines +++++++++++++++++++++++++++++++++++++++++++"
  grep -- "$Queries" "$LogFile" |
    cut -d" " -f11 |
    cut -d/ -f3 |
    sort -i | uniq -ci | sort -nrf

  section "++++++++ Most popular referers (blogs, non search engines) +++++++++++++++++++++"
  grep -v -- "$Queries" "$LogFile" |
    cut -d" " -f11 |
    grep '?' |
    grep -v -e "$Avoid" |
    sort -i | uniq -ci | sort -nrf

  section "++++++++ Sort destinations +++++++++++++++++++++++++++++++++++++++++++++++++++++"
  grep -a " -> " "$Searches" | sed -e "s/.* -> //" | rank_by_count

  section "++++++++ Search phrases ++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  search_phrases "$Searches" | rank_by_count

  section "++++++++ Search phrases and their destinations +++++++++++++++++++++++++++++++++"
  # NOTE(review): this report appears to have been disabled by the author;
  # uncomment to enable it.
  # sort -bf "$Searches" | uniq -ci

  section "++++++++ Search keywords only ++++++++++++++++++++++++++++++++++++++++++++++++++"
  # NOTE(review): this report appears to have been disabled by the author;
  # uncomment to enable it.
  # search_phrases "$Searches" |
  #   tr "?" " " | tr " " "\n" |
  #   grep -ave "^.$" | grep -ave "^..$" |
  #   rank_by_count

  section "++++++++ Leftovers +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  grep -av " -> " "$Searches"

  section "++++++++ New phrases +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  open_history "$PhraseList"
  # Counts are sorted numerically, like every other report.
  search_phrases "$Searches" | grep -aivxFf "$PhraseList" | rank_by_count
  # Update the list of past searches
  search_phrases "$Searches" | update_history "$PhraseList" '^.{1,3}$'

  section "++++++++++++++++++++++++++++++++++++++++"
  date
}

# Keep this as the last command: its status is the exit status of the script.
main "$@"