aboutsummaryrefslogblamecommitdiffstats
path: root/contrib/sitemapgen
blob: 21a4ffb222b8313fe0f2fff50ec62e63058de4e6 (plain) (tree)



































































































































































                                                                                                    
#! @PYTHON@

# For a given listname, this script generates sitemap.xml.gz files
# under archives/private/<listname>/
#
# Copyright (C) 1998-2018 by the Free Software Foundation, Inc.
#
# graciously hacked from bin/sync_members
#

"""Build Sitemap files for an archive

Usage: %(program)s [options] listname

Where `options' are:

    --help
    -h
        Print this message.

    listname
        Required.  This specifies the list to generate sitemaps for.
"""

import os
import sys
import paths
# Import this /after/ paths so that the sys.path is properly hacked
import email.Utils
from Mailman import MailList
from Mailman import Errors
from Mailman import Utils
from Mailman.UserDesc import UserDesc
from Mailman import mm_cfg
from Mailman.i18n import _
import getopt
import re
import time
from stat import *
from datetime import datetime, timedelta
import gzip


# Sitemap ranking table.  Each row is [max age in weeks, priority,
# changefreq], ordered from the youngest age bound to the oldest.  main()
# walks the rows in order and uses the first one whose age bound the file's
# modification time still falls within; a file older than every bound ends
# up with the values of the last row.
priorities = (
    [1, 1.0, "daily"],
    [4, 1.0, "weekly"],
    [30, 1.0, "monthly"],
    [52, 0.9, "never"],
    [100, 0.8, "never"],
    [200, 0.7, "never"],
    [300, 0.6, "never"],
    [400, 0.5, "never"],
)


# Name the script was invoked as; the usage text refers to it as %(program)s.
program = sys.argv[0]

def usage(code, msg=''):
    """Print the module help text, then exit the process.

    code -- exit status passed to sys.exit(); when it is non-zero the text
            is written to stderr, otherwise to stdout.
    msg  -- optional extra line printed after the help text.
    """
    # Default to stdout and only switch streams for an error exit.
    fd = sys.stdout
    if code:
        fd = sys.stderr
    print >> fd, _(__doc__)
    if msg:
        print >> fd, msg
    sys.exit(code)



def main():
    listname = None

    # TBD: can't use getopt with this command line syntax, which is broken and
    # should be changed to be getopt compatible.
    i = 1
    while i < len(sys.argv):
        opt = sys.argv[i]
        if opt in ('-h', '--help'):
            usage(0)
        else:
            try:
                listname = sys.argv[i].lower()
                i += 1
            except IndexError:
                usage(1, _('No listname given'))
            break

    if listname is None:
        usage(1, _('Must have a listname'))

    # get the locked list object
    try:
        mlist = MailList.MailList(listname, lock=0)
    except Errors.MMListError, e:
        print _('No such list: %(listname)s')
        sys.exit(1)

    rootdir = mlist.archive_dir()
    rooturl = mlist.GetBaseArchiveURL()

    reArcPath = re.compile(r'^\d+')
    reArcFile = re.compile(r'\d+\.html')

    sitemaps = []

    now = datetime.now()

    for folder in os.listdir(rootdir):
        path = os.path.join(rootdir,folder)
        if not os.path.isdir(path) or not reArcPath.search(folder):
            continue
    
        dirtime = os.path.getmtime(path)
    
        os.umask(0022)
        sitemap = os.path.join(rootdir,folder,"sitemap.xml.gz")
        f = gzip.open(sitemap, 'wb')
    
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    
        for file in os.listdir(path):
            if not reArcFile.search(file):
                continue
    
            # get timestamp of file
            st = os.stat(os.path.join(rootdir,folder,file))
            mtime = st[ST_MTIME] #modification time
    
            ts = datetime.fromtimestamp(mtime)
            for weeks, priority, changefreq in priorities:
                if ts > now - timedelta(weeks = weeks):
                    break
    
            f.write('  <url>\n  <loc>' + os.path.join(rooturl,folder,file) + '</loc>\n')
            f.write('    <lastmod>' + time.strftime("%Y-%m-%d",time.gmtime(mtime)) + '</lastmod>\n')
            f.write('    <changefreq>' + changefreq + '</changefreq>\n')
            f.write('    <priority>' + str(priority) + '</priority>\n')
            f.write('  </url>\n')
    
        f.write('</urlset>\n')
        f.close()
    
        sitemaps.append((os.path.join(rooturl,folder,"sitemap.xml.gz")))
    
    
    # write out the sitemapindex file
    sitemapindex = os.path.join(rootdir,"sitemap.xml.gz")
    f = gzip.open(sitemapindex, 'wb')
    
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    
    for sitemap in sitemaps:
        f.write('  <sitemap>\n  <loc>' + sitemap + '</loc>\n')
        f.write('    <lastmod>' + time.strftime("%Y-%m-%d", now.timetuple()) + '</lastmod>\n')
        f.write('  </sitemap>\n')
    
    f.write('</sitemapindex>\n')
    f.close()


# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()