#!/bin/bash
#
# Simple sitemap.xml generator for Mailman.
#
# Walks the pipermail HTML archives of the configured lists and writes a
# gzip-compressed sitemap (sitemaps.org protocol 0.9) to $XMLSITEMAP.
#
# Script by Tomasz Chmielewski; added to Mailman's contrib/ directory by
# Mark Sapiro on 2012-11-24.
#
# Usage:    edit the four settings below, then run the script (e.g. from cron).
# Requires: bash, GNU find (for -printf), grep, gzip.
# Exit:     0 on success; non-zero if no pages were found or the sitemap
#           could not be written (an existing sitemap is then left untouched).
# Note:     the sitemap protocol caps a single file at 50,000 URLs; that
#           limit is not enforced here.

# URL where your lists sit
SITEURL=http://lists.wpkg.org/pipermail

# path to your mailman archives/private
MAILMANPATH=/path/to/htdocs/archives/private

# lists we want to process
LISTS="debian-non-standard iodine-users sheepdog sheepdog-users stgt wpkg-announce wpkg-users"

# path to the sitemap.xml.gz file (gzipped)
XMLSITEMAP=/path/to/htdocs/cgi-bin/sitemap.xml.gz

# No need to change anything below
set -u

# String that identifies pages of the current month, e.g. "2012-November".
# It is matched against archive paths in process_all.  LC_ALL=C pins the
# English month name so the match does not depend on the caller's locale.
# NOTE(review): assumes the archive directories use English month names.
CURDATE=$(LC_ALL=C date +%Y-%B)

#######################################
# Print a diagnostic to stderr, prefixed with the script name.
# Arguments: $* - message
#######################################
warn() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# Entity-escape the five XML special characters.
# The result is returned in a variable rather than on stdout so that the
# per-URL loop in process_all does not have to fork a subshell.
# Arguments: $1 - raw string
# Globals:   XML_ESCAPED (written) - the escaped string
#######################################
xml_escape() {
  local s=$1 dq='"' sq="'"
  # '&' must be handled first, otherwise the entities added below would be
  # escaped again.  The replacements are quoted so that bash >= 5.2 does not
  # treat '&' as "the matched text".
  s=${s//&/"&amp;"}
  s=${s//</"&lt;"}
  s=${s//>/"&gt;"}
  s=${s//"$dq"/"&quot;"}
  s=${s//"$sq"/"&apos;"}
  XML_ESCAPED=$s
}

#######################################
# List every archived HTML page of every configured list.
# Globals:   LISTS, MAILMANPATH (read)
# Outputs:   one line per page: "<ISO-8601 mtime in UTC> <absolute path>"
#            (seconds are always 00, minute resolution only).
#            Anything with "attachments" in its path is left out.
#            Missing list directories are reported on stderr and skipped.
#######################################
collect_urls() {
  local list dir
  # LISTS is a whitespace-separated list, so word splitting is intended.
  for list in $LISTS; do
    # The trailing slash makes find descend even if the list directory is
    # a symlink.
    dir="$MAILMANPATH/$list/"
    if [[ ! -d "$dir" ]]; then
      warn "skipping list '$list': $dir is not a directory"
      continue
    fi
    # TZ=UTC so that the "+00:00" offset in the output is actually true.
    TZ=UTC find "$dir" -type f -name '*html' ! -path '*attachments*' \
      -printf '%TY-%Tm-%TdT%TH:%TM:00+00:00 %p\n'
  done
}

#######################################
# Emit one <url> element per line of $URLS.
# Globals:   URLS (read)  - newline-separated "<lastmod> <path>" lines;
#                           empty lines are ignored
#            MAILMANPATH, SITEURL (read) - a leading MAILMANPATH in each
#                           path is replaced by SITEURL
#            CURDATE (read)
# Arguments: $1 - "daily":   pages whose path contains $CURDATE get
#                            changefreq daily / priority 1.0, older ones
#                            monthly / 0.3
#                 "monthly": every page gets monthly / 0.2
# Outputs:   XML fragments on stdout
# Returns:   0 on success, 2 on a missing or unknown frequency
#######################################
process_all() {
  if (( $# < 1 )); then
    warn "process_all: missing frequency argument"
    return 2
  fi
  local frequency=$1
  case "$frequency" in
    daily | monthly) ;;
    *)
      warn "process_all: unknown frequency '$frequency'"
      return 2
      ;;
  esac

  local line lastmod path freq prio
  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    lastmod=${line%% *}
    path=${line#* }

    if [[ "$frequency" == daily && "$path" == *"$CURDATE"* ]]; then
      freq=daily
      prio=1.0
    elif [[ "$frequency" == daily ]]; then
      # if not current month, update monthly anyway
      freq=monthly
      prio=0.3
    else
      freq=monthly
      prio=0.2
    fi

    # Map the filesystem path to its public URL (literal prefix swap; the
    # path is never interpreted as a pattern).
    if [[ "$path" == "$MAILMANPATH"* ]]; then
      path="${SITEURL}${path#"$MAILMANPATH"}"
    fi
    xml_escape "$path"

    printf '\t<url>\n\t\t<loc>%s</loc>\n\t\t<lastmod>%s</lastmod>\n\t\t<changefreq>%s</changefreq>\n\t\t<priority>%s</priority>\n\t</url>\n' \
      "$XML_ESCAPED" "$lastmod" "$freq" "$prio"
  done <<< "${URLS-}"
}

#######################################
# Print the complete sitemap document.
# Arguments: $1 - index pages   ("<lastmod> <path>" lines), crawled daily
#            $2 - article pages ("<lastmod> <path>" lines), crawled monthly
# Outputs:   the XML document on stdout
#######################################
print_sitemap() {
  # process_all reads URLS; a local here is visible to it through bash's
  # dynamic scoping and does not leak to the caller.
  local URLS

  printf '%s\n' \
    '<?xml version="1.0" encoding="UTF-8"?>' \
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'

  # indexes should be crawled daily (demoted to monthly when they are old)
  URLS=$1
  process_all daily

  # if an article is crawled once a month, it should be enough
  URLS=$2
  process_all monthly

  printf '%s\n' '</urlset>'
}

#######################################
# Collect the pages, build the sitemap and publish it.
# Globals:   MAILMANPATH, XMLSITEMAP (read)
# Returns:   0 on success, 1 if nothing was found or writing failed
#######################################
main() {
  local all_urls index_pages article_pages tmp
  local index_re='/(author|date|index|subject|thread)\.html$'

  all_urls=$(collect_urls)
  if [[ -z "$all_urls" ]]; then
    warn "no archive pages found under $MAILMANPATH; leaving $XMLSITEMAP untouched"
    return 1
  fi

  index_pages=$(grep -E -- "$index_re" <<< "$all_urls")
  article_pages=$(grep -E -v -- "$index_re" <<< "$all_urls")

  # Write next to the target (same filesystem) and rename afterwards, so
  # readers never see a half-written sitemap.
  tmp="$XMLSITEMAP.tmp"
  if ! print_sitemap "$index_pages" "$article_pages" | gzip -9 -c > "$tmp"; then
    warn "could not write $tmp"
    rm -f -- "$tmp"
    return 1
  fi
  if ! mv -- "$tmp" "$XMLSITEMAP"; then
    warn "could not move $tmp to $XMLSITEMAP"
    rm -f -- "$tmp"
    return 1
  fi
}

# Keep this the last command: its status becomes the script's exit status.
main "$@"