-rw-r--r--  NEWS                   |  6
-rw-r--r--  contrib/README.sitemap | 18
-rw-r--r--  contrib/sitemap        | 86
3 files changed, 110 insertions, 0 deletions
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -29,6 +29,12 @@ Here is a history of user visible changes to Mailman.
   - list_lists now has an option to list only lists with public archives.
     (LP: 1082711)
 
+  Contributed programs
+
+  - A new "sitemap" bash script has been contributed by Tomasz Chmielewski
+    <mangoo@wpkg.org> to generate a sitemap.xml file of an installation's
+    public archives for submission to search engines.
+
   i18n
 
   - German message catalog has been updated per Ralf Hildebrandt.
diff --git a/contrib/README.sitemap b/contrib/README.sitemap
new file mode 100644
index 00000000..88932610
--- /dev/null
+++ b/contrib/README.sitemap
@@ -0,0 +1,18 @@
+A bash script from Tomasz Chmielewski <mangoo@wpkg.org>.
+
+If you ever wondered how to generate a sitemap file for Mailman (the GNU
+Mailing List Manager), to be submitted e.g. to Google, here is how.
+
+All you have to do is replace a few variables and submit the sitemap to
+your favourite search engine.  Consider using the -p or --public-archive
+option, new in 2.1.16, to Mailman's bin/list_lists to generate the list of
+lists to process, e.g.,
+
+LISTS=`/path/to/mailman/bin/list_lists -p -b`
+
+It is based on a real website running Mailman: http://lists.wpkg.org
+
+With this sitemap code, it takes around 5 seconds to generate a sitemap
+covering 10000 articles.
+
+Run it once a day via cron.
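The README's suggestion drops straight into the script below: list_lists -b
prints bare list names, one per line, so its output can seed the LISTS
variable instead of a hard-coded string. A minimal sketch, assuming a stock
Mailman 2.1.16 install (the install path is a placeholder):

    # Build LISTS from every list with a public archive instead of
    # hard-coding the names (-p/--public-archive is new in 2.1.16;
    # -b prints bare list names with no description).
    LISTS=$(/path/to/mailman/bin/list_lists -p -b)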
diff --git a/contrib/sitemap b/contrib/sitemap
new file mode 100644
index 00000000..34414de2
--- /dev/null
+++ b/contrib/sitemap
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Simple sitemap.xml generator for Mailman
+
+# URL where your lists sit
+SITEURL=http://lists.wpkg.org/pipermail
+
+# path to your mailman archives/private
+MAILMANPATH=/path/to/htdocs/archives/private
+
+# lists we want to process
+LISTS="debian-non-standard iodine-users sheepdog sheepdog-users stgt wpkg-announce wpkg-users"
+
+# path to the sitemap.xml.gz file (gzipped)
+XMLSITEMAP=/path/to/htdocs/cgi-bin/sitemap.xml.gz
+
+# No need to change anything below
+set -u
+
+# find html files with their dates
+URLS=""
+for LIST in $LISTS; do
+    URLS="$URLS"$'\n'$(find $MAILMANPATH/$LIST/ -type f -name \*html | xargs ls --time-style=long-iso -l | awk '{print $6"T"$7":00+00:00 "$8}' | grep -v attachments)
+done
+
+# if the article is crawled once a month, it should be enough
+MONTHLYLIST=$(echo "$URLS" | egrep -v '(author.html|date.html|index.html|subject.html|thread.html)')
+
+# indexes should be crawled daily. We'll set them to monthly later on, if they are old
+DAILYLIST=$(echo "$URLS" | egrep '(author.html|date.html|index.html|subject.html|thread.html)')  #'
+
+# process all URLs
+IFS="
+"
+
+# print the header
+OUTPUT='<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
+
+CURDATE=$(date +%Y-%B)
+
+process_all() {
+    for URL in $URLS; do
+        FREQUENCY=$1
+        DATE=${URL%% *}
+        FILENAME=${URL#* }
+        if [ $FREQUENCY == daily ] ; then
+            # if not current month, update monthly anyway
+            echo $FILENAME | grep -q $CURDATE
+            if [ $? -eq 0 ] ; then
+                FREQ=daily
+                PRIO=1.0
+            else
+                FREQ=monthly
+                PRIO=0.3
+            fi
+        elif [ $FREQUENCY == monthly ] ; then
+            FREQ=monthly
+            PRIO=0.2
+        fi
+        echo " <url>
+   <loc>$FILENAME</loc>
+   <lastmod>$DATE</lastmod>
+   <changefreq>$FREQ</changefreq>
+   <priority>$PRIO</priority>
+ </url>"
+    done
+}
+
+# process the URLs
+# daily
+URLS="$DAILYLIST"
+OUTPUT="$OUTPUT
+$(process_all daily)"
+
+# monthly
+URLS="$MONTHLYLIST"
+OUTPUT="$OUTPUT
+$(process_all monthly)"
+
+# close the </urlset>
+OUTPUT="$OUTPUT
+</urlset>"
+
+echo "$OUTPUT" | sed -e "s#$MAILMANPATH#$SITEURL#g" | gzip -9 -c >$XMLSITEMAP.tmp
+mv $XMLSITEMAP.tmp $XMLSITEMAP
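For reference, each record the find/ls/awk pipeline appends to URLS is an
ISO-8601 lastmod stamp followed by a file path, and the final sed rewrites
the filesystem prefix into a public pipermail URL. With the placeholder
settings above, one record and the <loc> it produces would look roughly like
this (list name, date, and article number are made up):

    # one URLS record: "<lastmod> <path>"
    2013-04-02T11:35:00+00:00 /path/to/htdocs/archives/private/wpkg-users/2013-April/007919.html

    # after sed "s#$MAILMANPATH#$SITEURL#g", the <loc> becomes:
    http://lists.wpkg.org/pipermail/wpkg-users/2013-April/007919.html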
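Since the README recommends running the script once a day, a crontab entry
along these lines would complete the setup (time of day and install path are
arbitrary):

    # regenerate the gzipped sitemap nightly
    30 4 * * * /path/to/mailman/contrib/sitemap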