Diffstat (limited to 'contrib')
-rw-r--r--  contrib/README.sitemap | 18
-rw-r--r--  contrib/sitemap        | 86
2 files changed, 104 insertions, 0 deletions
diff --git a/contrib/README.sitemap b/contrib/README.sitemap
new file mode 100644
index 00000000..88932610
--- /dev/null
+++ b/contrib/README.sitemap
@@ -0,0 +1,18 @@
+A bash script from Tomasz Chmielewski <mangoo@wpkg.org>.
+
+If you ever wondered how to generate a sitemap file for Mailman (the GNU
+Mailing List Manager), to be submitted e.g. to Google, here is how.
+
+All you have to do is replace a few variables and submit the sitemap to
+your favourite search engine.  Consider using the -p or --public-archive
+option (new in 2.1.16) to Mailman's bin/list_lists to generate the list
+of lists to process.  E.g.,
+
+LISTS=`/path/to/mailman/bin/list_lists -p -b`
+
+It is based on a real website running Mailman: http://lists.wpkg.org
+
+With this sitemap code, it takes around 5 seconds to generate a sitemap
+consisting of 10000 articles.
+
+Run it once a day via cron.
diff --git a/contrib/sitemap b/contrib/sitemap
new file mode 100644
index 00000000..34414de2
--- /dev/null
+++ b/contrib/sitemap
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Simple sitemap.xml generator for Mailman
+
+# URL where your lists sit
+SITEURL=http://lists.wpkg.org/pipermail
+
+# path to your mailman archives/private
+MAILMANPATH=/path/to/htdocs/archives/private
+
+# lists we want to process
+LISTS="debian-non-standard iodine-users sheepdog sheepdog-users stgt wpkg-announce wpkg-users"
+
+# path to the sitemap.xml.gz file (gzipped)
+XMLSITEMAP=/path/to/htdocs/cgi-bin/sitemap.xml.gz
+
+# No need to change anything below
+set -u
+
+# find html files with their dates
+URLS=""
+for LIST in $LISTS; do
+    URLS="$URLS"$'\n'$(find $MAILMANPATH/$LIST/ -type f -name \*html | xargs ls --time-style=long-iso -l | awk '{print $6"T"$7":00+00:00 "$8}' | grep -v attachments)
+done
+
+# if the article is crawled once a month, it should be enough
+MONTHLYLIST=$(echo "$URLS" | egrep -v '(author.html|date.html|index.html|subject.html|thread.html)')
+
+# indexes should be crawled daily; we'll set them to monthly later on, if they are old
+DAILYLIST=$(echo "$URLS" | egrep '(author.html|date.html|index.html|subject.html|thread.html)') #'
+
+# print the header
+OUTPUT='<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
+
+# split URLs on newlines only
+IFS="
+"
+
+CURDATE=$(date +%Y-%B)
+
+process_all() {
+    FREQUENCY=$1
+    for URL in $URLS; do
+        DATE=${URL%% *}
+        FILENAME=${URL#* }
+        if [ $FREQUENCY == daily ] ; then
+            # if not current month, update monthly anyway
+            echo $FILENAME | grep -q $CURDATE
+            if [ $? -eq 0 ] ; then
+                FREQ=daily
+                PRIO=1.0
+            else
+                FREQ=monthly
+                PRIO=0.3
+            fi
+        elif [ $FREQUENCY == monthly ] ; then
+            FREQ=monthly
+            PRIO=0.2
+        fi
+        echo "  <url>
+    <loc>$FILENAME</loc>
+    <lastmod>$DATE</lastmod>
+    <changefreq>$FREQ</changefreq>
+    <priority>$PRIO</priority>
+  </url>"
+    done
+}
+
+# process the URLs
+# daily
+URLS="$DAILYLIST"
+OUTPUT="$OUTPUT
+$(process_all daily)"
+
+# monthly
+URLS="$MONTHLYLIST"
+OUTPUT="$OUTPUT
+$(process_all monthly)"
+
+# close the </urlset>
+OUTPUT="$OUTPUT
+</urlset>"
+
+echo "$OUTPUT" | sed -e "s#$MAILMANPATH#$SITEURL#g" | gzip -9 -c >$XMLSITEMAP.tmp
+mv $XMLSITEMAP.tmp $XMLSITEMAP
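For reference, each archive page becomes one <url> element in the generated
file. After sed has rewritten $MAILMANPATH to $SITEURL, a single entry emitted
by process_all would look roughly like the sketch below; the list name and
date are illustrative, not taken from a real run. An index page from the
current month gets changefreq daily and priority 1.0, as in the script:

    <?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url>
        <loc>http://lists.wpkg.org/pipermail/wpkg-users/2013-December/thread.html</loc>
        <lastmod>2013-12-01T09:30:00+00:00</lastmod>
        <changefreq>daily</changefreq>
        <priority>1.0</priority>
      </url>
    </urlset>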
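The README says to run the script once a day via cron. A minimal crontab
entry could look like this; the install path is hypothetical, so adjust it
to wherever you keep the script:

    # regenerate the sitemap every night at 03:15
    15 3 * * * /path/to/mailman/contrib/sitemap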
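Besides submitting the sitemap to a search engine by hand, crawlers can also
discover it through a Sitemap line in robots.txt (part of the sitemaps.org
protocol). The URL below is only an assumption based on the XMLSITEMAP path
above; it depends on how your web server maps that directory:

    Sitemap: http://lists.wpkg.org/cgi-bin/sitemap.xml.gz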