path: root/Mailman/Archiver/pipermail.py



#! /usr/bin/env python

from __future__ import nested_scopes

import mailbox
import os
import re
import sys
import time
from email.Utils import parseaddr, parsedate_tz, mktime_tz, formatdate
import cPickle as pickle
from cStringIO import StringIO
from string import lowercase

__version__ = '0.09 (Mailman edition)'
VERSION = __version__
CACHESIZE = 100    # Number of slots in the cache

from Mailman import Errors
from Mailman.Mailbox import ArchiverMailbox
from Mailman.Logging.Syslog import syslog
from Mailman.i18n import _

SPACE = ' '


msgid_pat = re.compile(r'(<.*>)')
def strip_separators(s):
    "Remove quotes or parenthesization from a Message-ID string"
    if not s:
        return ""
    if s[0] in '"<([' and s[-1] in '">)]':
        s = s[1:-1]
    return s

smallNameParts = ['van', 'von', 'der', 'de']

def fixAuthor(author):
    "Canonicalize a name into Last, First format"
    # If there's a comma, guess that it's already in "Last, First" format
    if ',' in author:
        return author
    L = author.split()
    i = len(L) - 1
    if i == 0:
        return author # The string's one word--forget it
    if author.upper() == author or author.lower() == author:
        # Damn, the name is all upper- or lower-case.
        while i > 0 and L[i-1].lower() in smallNameParts:
            i = i - 1
    else:
        # Mixed case; assume that small parts of the last name will be
        # in lowercase, and check them against the list.
        while i>0 and (L[i-1][0] in lowercase or
                       L[i-1].lower() in smallNameParts):
            i = i - 1
    author = SPACE.join(L[-1:] + L[i:-1]) + ', ' + SPACE.join(L[:i])
    return author

# Abstract class for databases

class DatabaseInterface:
    def __init__(self): pass
    def close(self): pass
    def getArticle(self, archive, msgid): pass
    def hasArticle(self, archive, msgid): pass
    def addArticle(self, archive, article, subject=None, author=None,
                   date=None): pass
    def firstdate(self, archive): pass
    def lastdate(self, archive): pass
    def first(self, archive, index): pass
    def next(self, archive, index): pass
    def numArticles(self, archive): pass
    def newArchive(self, archive): pass
    def setThreadKey(self, archive, key, msgid): pass
    def getOldestArticle(self, subject): pass

class Database(DatabaseInterface):
    """Define the basic sorting logic for a database

    Assumes that the database internally uses dateIndex, authorIndex,
    etc.
    """

    # TBD Factor out more of the logic shared between BSDDBDatabase
    # and HyperDatabase and place it in this class.

    def __init__(self):
        # This method need not be called by subclasses that do their
        # own initialization.
        self.dateIndex = {}
        self.authorIndex = {}
        self.subjectIndex = {}
        self.articleIndex = {}
        self.changed = {}

    def addArticle(self, archive, article, subject=None, author=None,
                   date=None):
        # create the keys; always end w/ msgid which will be unique
        authorkey = (author or article.author, article.date,
                     article.msgid)
        subjectkey = (subject or article.subject, article.date,
                      article.msgid)
        datekey = date or article.date, article.msgid

        # Add the new article
        self.dateIndex[datekey] = article.msgid
        self.authorIndex[authorkey] = article.msgid
        self.subjectIndex[subjectkey] = article.msgid

        self.store_article(article)
        self.changed[archive, article.msgid] = None

        parentID = article.parentID
        if parentID is not None and self.articleIndex.has_key(parentID):
            parent = self.getArticle(archive, parentID)
            myThreadKey = parent.threadKey + article.date + '-'
        else:
            myThreadKey = article.date + '-'
        article.threadKey = myThreadKey
        key = myThreadKey, article.msgid
        self.setThreadKey(archive, key, article.msgid)

    def store_article(self, article):
        """Store article without message body to save space"""
        # TBD this is not thread safe!
        temp = article.body
        temp2 = article.html_body
        article.body = []
        del article.html_body
        self.articleIndex[article.msgid] = pickle.dumps(article)
        article.body = temp
        article.html_body = temp2


# The Article class encapsulates a single posting.  The attributes
# are:
#
# sequence   : Sequence number, unique for each article in a set of archives
# subject    : Subject
# datestr    : The posting date, in human-readable format
# date       : The posting date, in purely numeric format
# headers    : Any other headers of interest
# author     : The author's name (and possibly organization)
# email      : The author's e-mail address
# msgid      : A unique message ID
# in_reply_to: If != "", this is the msgid of the article being replied to
# references : A (possibly empty) list of msgid's of earlier articles
#              in the thread
# body       : A list of strings making up the message body

class Article:
    _last_article_time = time.time()

    def __init__(self, message = None, sequence = 0, keepHeaders = []):
        if message is None:
            return
        self.sequence = sequence

        self.parentID = None
        self.threadKey = None
        # otherwise the current sequence number is used.
        id = strip_separators(message['Message-Id'])
        if id == "":
            self.msgid = str(self.sequence)
        else: self.msgid = id

        if message.has_key('Subject'):
            self.subject = str(message['Subject'])
        else:
            self.subject = _('No subject')
        if self.subject == "": self.subject = _('No subject')

        self._set_date(message)

        # Figure out the e-mail address and poster's name.  Use the From:
        # field first, followed by Reply-To:
        self.author, self.email = parseaddr(message.get('From', ''))
        e = message['Reply-To']
        if not self.email and e is not None:
            ignoreauthor, self.email = parseaddr(e)
        self.email = strip_separators(self.email)
        self.author = strip_separators(self.author)

        if self.author == "":
            self.author = self.email

        # Save the In-Reply-To:, References:, and Message-ID: lines
        #
        # TBD: The original code does some munging on these fields, which
        # shouldn't be necessary, but changing this may break code.  For
        # safety, I save the original headers on different attributes for use
        # in writing the plain text periodic flat files.
        self._in_reply_to = message['in-reply-to']
        self._references = message['references']
        self._message_id = message['message-id']

        i_r_t = message['In-Reply-To']
        if i_r_t is None:
            self.in_reply_to = ''
        else:
            match = msgid_pat.search(i_r_t)
            if match is None: self.in_reply_to = ''
            else: self.in_reply_to = strip_separators(match.group(1))

        references = message['References']
        if references is None:
            self.references = []
        else:
            self.references = map(strip_separators, references.split())

        # Save any other interesting headers
        self.headers = {}
        for i in keepHeaders:
            if message.has_key(i):
                self.headers[i] = message[i]

        # Read the message body
        # Check Scrubber-munged paylaod
        if message.get('x-mailman-scrubbed'):
            decode = False
        else:
            decode = True
        s = StringIO(message.get_payload(decode=decode)\
                     or message.as_string().split('\n\n',1)[1])
        self.body = s.readlines()

    def _set_date(self, message):
        def floatdate(header):
            missing = []
            datestr = message.get(header, missing)
            if datestr is missing:
                return None
            date = parsedate_tz(datestr)
            try:
                return mktime_tz(date)
            except (TypeError, ValueError, OverflowError):
                return None
        date = floatdate('date')
        if date is None:
            date = floatdate('x-list-received-date')
        if date is None:
            # What's left to try?
            date = self._last_article_time + 1
        self._last_article_time = date
        self.date = '%011i' % date
        self.datestr = message.get('date') \
                       or message.get('x-list-received-date') \
                       or formatdate(date)

    def __repr__(self):
        return '<Article ID = '+repr(self.msgid)+'>'

    def finished_update_article(self):
        pass    

# Pipermail formatter class

class T:
    DIRMODE = 0755      # Mode to give to created directories
    FILEMODE = 0644     # Mode to give to created files
    INDEX_EXT = ".html" # Extension for indexes

    def __init__(self, basedir = None, reload = 1, database = None):
        # If basedir isn't provided, assume the current directory
        if basedir is None:
            self.basedir = os.getcwd()
        else:
            basedir = os.path.expanduser(basedir)
            self.basedir = basedir
        self.database = database

        # If the directory doesn't exist, create it.  This code shouldn't get
        # run anymore, we create the directory in Archiver.py.  It should only
        # get used by legacy lists created that are only receiving their first
        # message in the HTML archive now -- Marc
        try:
            os.stat(self.basedir)
        except os.error, errdata:
            errno, errmsg = errdata
            if errno != 2:
                raise os.error, errdata
            else:
                self.message(_('Creating archive directory ') + self.basedir)
                omask = os.umask(0)
                try:
                    os.mkdir(self.basedir, self.DIRMODE)
                finally:
                    os.umask(omask)

        # Try to load previously pickled state
        try:
            if not reload:
                raise IOError
            f = open(os.path.join(self.basedir, 'pipermail.pck'), 'r')
            self.message(_('Reloading pickled archive state'))
            d = pickle.load(f)
            f.close()
            for key, value in d.items():
                setattr(self, key, value)
        except (IOError, EOFError):
            # No pickled version, so initialize various attributes
            self.archives = []        # Archives
            self._dirty_archives = [] # Archives that will have to be updated
            self.sequence = 0         # Sequence variable used for
                                      #   numbering articles
            self.update_TOC = 0       # Does the TOC need updating?
        #
        # make the basedir variable work when passed in as an __init__ arg
        # and different from the one in the pickle.  Let the one passed in
        # as an __init__ arg take precedence if it's stated.  This way, an
        # archive can be moved from one place to another and still work.
        #
        if basedir != self.basedir:
            self.basedir = basedir

    def close(self):
        "Close an archive, save its state, and update any changed archives."
        self.update_dirty_archives()
        self.update_TOC = 0
        self.write_TOC()
        # Save the collective state
        self.message(_('Pickling archive state into ')
                     + os.path.join(self.basedir, 'pipermail.pck'))
        self.database.close()
        del self.database

        omask = os.umask(007)
        try:
            f = open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
        finally:
            os.umask(omask)
        pickle.dump(self.getstate(), f)
        f.close()

    def getstate(self):
        # can override this in subclass
        return self.__dict__

    #
    # Private methods
    #
    # These will be neither overridden nor called by custom archivers.
    #


    # Create a dictionary of various parameters that will be passed
    # to the write_index_{header,footer} functions
    def __set_parameters(self, archive):
        # Determine the earliest and latest date in the archive
        firstdate = self.database.firstdate(archive)
        lastdate = self.database.lastdate(archive)

        # Get the current time
        now = time.asctime(time.localtime(time.time()))
        self.firstdate = firstdate
        self.lastdate = lastdate
        self.archivedate = now
        self.size = self.database.numArticles(archive)
        self.archive = archive
        self.version = __version__

    # Find the message ID of an article's parent, or return None
    # if no parent can be found.

    def __findParent(self, article, children = []):
            parentID = None
            if article.in_reply_to:
                parentID = article.in_reply_to
            elif article.references:
                # Remove article IDs that aren't in the archive
                refs = filter(self.articleIndex.has_key, article.references)
                if not refs:
                    return None
                maxdate = self.database.getArticle(self.archive,
                                                   refs[0])
                for ref in refs[1:]:
                    a = self.database.getArticle(self.archive, ref)
                    if a.date > maxdate.date:
                        maxdate = a
                parentID = maxdate.msgid
            else:
                # Look for the oldest matching subject
                try:
                    key, tempid = \
                         self.subjectIndex.set_location(article.subject)
                    print key, tempid
                    self.subjectIndex.next()
                    [subject, date] = key.split('\0')
                    print article.subject, subject, date
                    if subject == article.subject and tempid not in children:
                        parentID = tempid
                except KeyError:
                    pass
            return parentID

    # Update the threaded index completely
    def updateThreadedIndex(self):
        # Erase the threaded index
        self.database.clearIndex(self.archive, 'thread')

        # Loop over all the articles
        msgid = self.database.first(self.archive, 'date')
        while msgid is not None:
            try:
                article = self.database.getArticle(self.archive, msgid)
            except KeyError:
                pass
            else:
                if article.parentID is None or \
                   not self.database.hasArticle(self.archive,
                                                article.parentID):
                    # then
                    pass
                else:
                    parent = self.database.getArticle(self.archive,
                                                    article.parentID)
                    article.threadKey = parent.threadKey+article.date+'-'
                self.database.setThreadKey(self.archive,
                    (article.threadKey, article.msgid),
                    msgid)
            msgid = self.database.next(self.archive, 'date')

    #
    # Public methods:
    #
    # These are part of the public interface of the T class, but will
    # never be overridden (unless you're trying to do something very new).

    # Update a single archive's indices, whether the archive's been
    # dirtied or not.
    def update_archive(self, archive):
        self.archive = archive
        self.message(_("Updating index files for archive [%(archive)s]"))
        arcdir = os.path.join(self.basedir, archive)
        self.__set_parameters(archive)

        for hdr in ('Date', 'Subject', 'Author'):
            self._update_simple_index(hdr, archive, arcdir)

        self._update_thread_index(archive, arcdir)

    def _update_simple_index(self, hdr, archive, arcdir):
        self.message("  " + hdr)
        self.type = hdr
        hdr = hdr.lower()

        self._open_index_file_as_stdout(arcdir, hdr)
        self.write_index_header()
        count = 0
        # Loop over the index entries
        msgid = self.database.first(archive, hdr)
        while msgid is not None:
            try:
                article = self.database.getArticle(self.archive, msgid)
            except KeyError:
                pass
            else:
                count = count + 1
                self.write_index_entry(article)
            msgid = self.database.next(archive, hdr)
        # Finish up this index
        self.write_index_footer()
        self._restore_stdout()

    def _update_thread_index(self, archive, arcdir):
        self.message(_("  Thread"))
        self._open_index_file_as_stdout(arcdir, "thread")
        self.type = 'Thread'
        self.write_index_header()

        # To handle the prev./next in thread pointers, we need to
        # track articles 5 at a time.

        # Get the first 5 articles
        L = [None] * 5
        i = 2
        msgid = self.database.first(self.archive, 'thread')

        while msgid is not None and i < 5:
            L[i] = self.database.getArticle(self.archive, msgid)
            i = i + 1
            msgid = self.database.next(self.archive, 'thread')

        while L[2] is not None:
            article = L[2]
            artkey = None
            if article is not None:
                artkey = article.threadKey
            if artkey is not None:
                self.write_threadindex_entry(article, artkey.count('-') - 1)
                if self.database.changed.has_key((archive,article.msgid)):
                    a1 = L[1]
                    a3 = L[3]
                    self.update_article(arcdir, article, a1, a3)
                    if a3 is not None:
                        self.database.changed[(archive, a3.msgid)] = None
                    if a1 is not None:
                        key = archive, a1.msgid
                        if not self.database.changed.has_key(key):
                            self.update_article(arcdir, a1, L[0], L[2])
                        else:
                            del self.database.changed[key]
            if L[0]:
                L[0].finished_update_article()
            L = L[1:]                   # Rotate the list
            if msgid is None:
                L.append(msgid)
            else:
                L.append(self.database.getArticle(self.archive, msgid))
            msgid = self.database.next(self.archive, 'thread')

        self.write_index_footer()
        self._restore_stdout()

    def _open_index_file_as_stdout(self, arcdir, index_name):
        path = os.path.join(arcdir, index_name + self.INDEX_EXT)
        omask = os.umask(002)
        try:
            self.__f = open(path, 'w')
        finally:
            os.umask(omask)
        self.__stdout = sys.stdout
        sys.stdout = self.__f

    def _restore_stdout(self):
        sys.stdout = self.__stdout
        self.__f.close()
        del self.__f
        del self.__stdout

    # Update only archives that have been marked as "changed".
    def update_dirty_archives(self):
        for i in self._dirty_archives:
            self.update_archive(i)
        self._dirty_archives = []

    # Read a Unix mailbox file from the file object <input>,
    # and create a series of Article objects.  Each article
    # object will then be archived.

    def _makeArticle(self, msg, sequence):
        return Article(msg, sequence)

    def processUnixMailbox(self, input, start=None, end=None):
        mbox = ArchiverMailbox(input, self.maillist)
        if start is None:
            start = 0
        counter = 0
        while counter < start:
            try:
                m = mbox.next()
            except Errors.DiscardMessage:
                continue
            if m is None:
                return
            counter += 1
        while 1:
            try:
                pos = input.tell()
                m = mbox.next()
            except Errors.DiscardMessage:
                continue
            except Exception:
                syslog('error', 'uncaught archiver exception at filepos: %s',
                       pos)
                raise
            if m is None:
                break
            if m == '':
                # It was an unparseable message
                continue
            msgid = m.get('message-id', 'n/a')
            self.message(_('#%(counter)05d %(msgid)s'))
            a = self._makeArticle(m, self.sequence)
            self.sequence += 1
            self.add_article(a)
            if end is not None and counter >= end:
               break
            counter += 1

    def new_archive(self, archive, archivedir):
        self.archives.append(archive)
        self.update_TOC = 1
        self.database.newArchive(archive)
        # If the archive directory doesn't exist, create it
        try:
            os.stat(archivedir)
        except os.error, errdata:
            errno, errmsg = errdata
            if errno == 2:
                omask = os.umask(0)
                try:
                    os.mkdir(archivedir, self.DIRMODE)
                finally:
                    os.umask(omask)
            else:
                raise os.error, errdata
        self.open_new_archive(archive, archivedir)

    def add_article(self, article):
        archives = self.get_archives(article)
        if not archives:
            return
        if type(archives) == type(''):
            archives = [archives]

        article.filename = filename = self.get_filename(article)
        temp = self.format_article(article)
        for arch in archives:
            self.archive = arch # why do this???
            archivedir = os.path.join(self.basedir, arch)
            if arch not in self.archives:
                self.new_archive(arch, archivedir)

            # Write the HTML-ized article
            self.write_article(arch, temp, os.path.join(archivedir,
                                                        filename))

            if article.decoded.has_key('author'):
                author = fixAuthor(article.decoded['author'])
            else:
                author = fixAuthor(article.author)
            if article.decoded.has_key('stripped'):
                subject = article.decoded['stripped'].lower()
            else:
                subject = article.subject.lower()

            article.parentID = parentID = self.get_parent_info(arch, article)
            if parentID:
                parent = self.database.getArticle(arch, parentID)
                article.threadKey = parent.threadKey + article.date + '-'
            else:
                article.threadKey = article.date + '-'
            key = article.threadKey, article.msgid

            self.database.setThreadKey(arch, key, article.msgid)
            self.database.addArticle(arch, temp, author=author,
                                     subject=subject)

            if arch not in self._dirty_archives:
                self._dirty_archives.append(arch)

    def get_parent_info(self, archive, article):
        parentID = None
        if article.in_reply_to:
            parentID = article.in_reply_to
        elif article.references:
            refs = self._remove_external_references(article.references)
            if refs:
                maxdate = self.database.getArticle(archive, refs[0])
                for ref in refs[1:]:
                    a = self.database.getArticle(archive, ref)
                    if a.date > maxdate.date:
                        maxdate = a
                parentID = maxdate.msgid
        else:
            # Get the oldest article with a matching subject, and
            # assume this is a follow-up to that article
            parentID = self.database.getOldestArticle(archive,
                                                      article.subject)

        if parentID and not self.database.hasArticle(archive, parentID):
            parentID = None
        return parentID

    def write_article(self, index, article, path):
        omask = os.umask(002)
        try:
            f = open(path, 'w')
        finally:
            os.umask(omask)
        temp_stdout, sys.stdout = sys.stdout, f
        self.write_article_header(article)
        sys.stdout.writelines(article.body)
        self.write_article_footer(article)
        sys.stdout = temp_stdout
        f.close()

    def _remove_external_references(self, refs):
        keep = []
        for ref in refs:
            if self.database.hasArticle(self.archive, ref):
                keep.append(ref)
        return keep

    # Abstract methods: these will need to be overridden by subclasses
    # before anything useful can be done.

    def get_filename(self, article):
        pass
    def get_archives(self, article):
        """Return a list of indexes where the article should be filed.
        A string can be returned if the list only contains one entry,
        and the empty list is legal."""
        pass
    def format_article(self, article):
        pass
    def write_index_header(self):
        pass
    def write_index_footer(self):
        pass
    def write_index_entry(self, article):
        pass
    def write_threadindex_entry(self, article, depth):
        pass
    def write_article_header(self, article):
        pass
    def write_article_footer(self, article):
        pass
    def write_article_entry(self, article):
        pass
    def update_article(self, archivedir, article, prev, next):
        pass
    def write_TOC(self):
        pass
    def open_new_archive(self, archive, dir):
        pass
    def message(self, msg):
        pass


class BSDDBdatabase(Database):
    __super_addArticle = Database.addArticle

    def __init__(self, basedir):
        self.__cachekeys = []
        self.__cachedict = {}
        self.__currentOpenArchive = None # The currently open indices
        self.basedir = os.path.expanduser(basedir)
        self.changed = {} # Recently added articles, indexed only by
                          # message ID

    def firstdate(self, archive):
        self.__openIndices(archive)
        date = 'None'
        try:
            date, msgid = self.dateIndex.first()
            date = time.asctime(time.localtime(float(date)))
        except KeyError:
            pass
        return date

    def lastdate(self, archive):
        self.__openIndices(archive)
        date = 'None'
        try:
            date, msgid = self.dateIndex.last()
            date = time.asctime(time.localtime(float(date)))
        except KeyError:
            pass
        return date

    def numArticles(self, archive):
        self.__openIndices(archive)
        return len(self.dateIndex)

    def addArticle(self, archive, article, subject=None, author=None,
                   date=None):
        self.__openIndices(archive)
        self.__super_addArticle(archive, article, subject, author, date)

    # Open the BSDDB files that are being used as indices
    # (dateIndex, authorIndex, subjectIndex, articleIndex)
    def __openIndices(self, archive):
        if self.__currentOpenArchive == archive:
            return

        import bsddb
        self.__closeIndices()
        arcdir = os.path.join(self.basedir, 'database')
        omask = os.umask(0)
        try:
            try:
                os.mkdir(arcdir, 02775)
            except OSError:
                # BAW: Hmm...
                pass
        finally:
            os.umask(omask)
        for hdr in ('date', 'author', 'subject', 'article', 'thread'):
            path = os.path.join(arcdir, archive + '-' + hdr)
            t = bsddb.btopen(path, 'c')
            setattr(self, hdr + 'Index', t)
        self.__currentOpenArchive = archive

    # Close the BSDDB files that are being used as indices (if they're
    # open--this is safe to call if they're already closed)
    def __closeIndices(self):
        if self.__currentOpenArchive is not None:
            pass
        for hdr in ('date', 'author', 'subject', 'thread', 'article'):
            attr = hdr + 'Index'
            if hasattr(self, attr):
                index = getattr(self, attr)
                if hdr == 'article':
                    if not hasattr(self, 'archive_length'):
                        self.archive_length = {}
                    self.archive_length[self.__currentOpenArchive] = len(index)
                index.close()
                delattr(self,attr)
        self.__currentOpenArchive = None

    def close(self):
        self.__closeIndices()
    def hasArticle(self, archive, msgid):
        self.__openIndices(archive)
        return self.articleIndex.has_key(msgid)
    def setThreadKey(self, archive, key, msgid):
        self.__openIndices(archive)
        self.threadIndex[key] = msgid
    def getArticle(self, archive, msgid):
        self.__openIndices(archive)
        if self.__cachedict.has_key(msgid):
            self.__cachekeys.remove(msgid)
            self.__cachekeys.append(msgid)
            return self.__cachedict[msgid]
        if len(self.__cachekeys) == CACHESIZE:
            delkey, self.__cachekeys = (self.__cachekeys[0],
                                        self.__cachekeys[1:])
            del self.__cachedict[delkey]
        s = self.articleIndex[msgid]
        article = pickle.loads(s)
        self.__cachekeys.append(msgid)
        self.__cachedict[msgid] = article
        return article

    def first(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        try:
            key, msgid = index.first()
            return msgid
        except KeyError:
            return None
    def next(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        try:
            key, msgid = index.next()
        except KeyError:
            return None
        else:
            return msgid

    def getOldestArticle(self, archive, subject):
        self.__openIndices(archive)
        subject = subject.lower()
        try:
            key, tempid = self.subjectIndex.set_location(subject)
            self.subjectIndex.next()
            [subject2, date] = key.split('\0')
            if subject != subject2:
                return None
            return tempid
        except KeyError: # XXX what line raises the KeyError?
            return None

    def newArchive(self, archive):
        pass

    def clearIndex(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        finished = 0
        try:
            key, msgid = self.threadIndex.first()
        except KeyError:
            finished = 1
        while not finished:
            del self.threadIndex[key]
            try:
                key, msgid = self.threadIndex.next()
            except KeyError:
                finished = 1
#! /usr/bin/env python

from __future__ import nested_scopes

import mailbox
import os
import re
import sys
import time
from email.Utils import parseaddr, parsedate_tz, mktime_tz, formatdate
import cPickle as pickle
from cStringIO import StringIO
from string import lowercase

__version__ = '0.09 (Mailman edition)'
VERSION = __version__
CACHESIZE = 100    # Number of slots in the cache

from Mailman import Errors
from Mailman.Mailbox import ArchiverMailbox
from Mailman.Logging.Syslog import syslog
from Mailman.i18n import _

SPACE = ' '


msgid_pat = re.compile(r'(<.*>)')
def strip_separators(s):
    "Remove quotes or parenthesization from a Message-ID string"
    if not s:
        return ""
    if s[0] in '"<([' and s[-1] in '">)]':
        s = s[1:-1]
    return s

smallNameParts = ['van', 'von', 'der', 'de']

def fixAuthor(author):
    "Canonicalize a name into Last, First format"
    # If there's a comma, guess that it's already in "Last, First" format
    if ',' in author:
        return author
    L = author.split()
    i = len(L) - 1
    if i == 0:
        return author # The string's one word--forget it
    if author.upper() == author or author.lower() == author:
        # Damn, the name is all upper- or lower-case.
        while i > 0 and L[i-1].lower() in smallNameParts:
            i = i - 1
    else:
        # Mixed case; assume that small parts of the last name will be
        # in lowercase, and check them against the list.
        while i>0 and (L[i-1][0] in lowercase or
                       L[i-1].lower() in smallNameParts):
            i = i - 1
    author = SPACE.join(L[-1:] + L[i:-1]) + ', ' + SPACE.join(L[:i])
    return author

# Abstract class for databases

class DatabaseInterface:
    def __init__(self): pass
    def close(self): pass
    def getArticle(self, archive, msgid): pass
    def hasArticle(self, archive, msgid): pass
    def addArticle(self, archive, article, subject=None, author=None,
                   date=None): pass
    def firstdate(self, archive): pass
    def lastdate(self, archive): pass
    def first(self, archive, index): pass
    def next(self, archive, index): pass
    def numArticles(self, archive): pass
    def newArchive(self, archive): pass
    def setThreadKey(self, archive, key, msgid): pass
    def getOldestArticle(self, subject): pass

class Database(DatabaseInterface):
    """Define the basic sorting logic for a database

    Assumes that the database internally uses dateIndex, authorIndex,
    etc.
    """

    # TBD Factor out more of the logic shared between BSDDBDatabase
    # and HyperDatabase and place it in this class.

    def __init__(self):
        # This method need not be called by subclasses that do their
        # own initialization.
        self.dateIndex = {}
        self.authorIndex = {}
        self.subjectIndex = {}
        self.articleIndex = {}
        self.changed = {}

    def addArticle(self, archive, article, subject=None, author=None,
                   date=None):
        # create the keys; always end w/ msgid which will be unique
        authorkey = (author or article.author, article.date,
                     article.msgid)
        subjectkey = (subject or article.subject, article.date,
                      article.msgid)
        datekey = date or article.date, article.msgid

        # Add the new article
        self.dateIndex[datekey] = article.msgid
        self.authorIndex[authorkey] = article.msgid
        self.subjectIndex[subjectkey] = article.msgid

        self.store_article(article)
        self.changed[archive, article.msgid] = None

        parentID = article.parentID
        if parentID is not None and self.articleIndex.has_key(parentID):
            parent = self.getArticle(archive, parentID)
            myThreadKey = parent.threadKey + article.date + '-'
        else:
            myThreadKey = article.date + '-'
        article.threadKey = myThreadKey
        key = myThreadKey, article.msgid
        self.setThreadKey(archive, key, article.msgid)

    def store_article(self, article):
        """Store article without message body to save space"""
        # TBD this is not thread safe!
        temp = article.body
        temp2 = article.html_body
        article.body = []
        del article.html_body
        self.articleIndex[article.msgid] = pickle.dumps(article)
        article.body = temp
        article.html_body = temp2


# The Article class encapsulates a single posting.  The attributes
# are:
#
# sequence   : Sequence number, unique for each article in a set of archives
# subject    : Subject
# datestr    : The posting date, in human-readable format
# date       : The posting date, in purely numeric format
# headers    : Any other headers of interest
# author     : The author's name (and possibly organization)
# email      : The author's e-mail address
# msgid      : A unique message ID
# in_reply_to: If != "", this is the msgid of the article being replied to
# references : A (possibly empty) list of msgid's of earlier articles
#              in the thread
# body       : A list of strings making up the message body

class Article:
    _last_article_time = time.time()

    def __init__(self, message = None, sequence = 0, keepHeaders = []):
        if message is None:
            return
        self.sequence = sequence

        self.parentID = None
        self.threadKey = None
        # otherwise the current sequence number is used.
        id = strip_separators(message['Message-Id'])
        if id == "":
            self.msgid = str(self.sequence)
        else: self.msgid = id

        if message.has_key('Subject'):
            self.subject = str(message['Subject'])
        else:
            self.subject = _('No subject')
        if self.subject == "": self.subject = _('No subject')

        self._set_date(message)

        # Figure out the e-mail address and poster's name.  Use the From:
        # field first, followed by Reply-To:
        self.author, self.email = parseaddr(message.get('From', ''))
        e = message['Reply-To']
        if not self.email and e is not None:
            ignoreauthor, self.email = parseaddr(e)
        self.email = strip_separators(self.email)
        self.author = strip_separators(self.author)

        if self.author == "":
            self.author = self.email

        # Save the In-Reply-To:, References:, and Message-ID: lines
        #
        # TBD: The original code does some munging on these fields, which
        # shouldn't be necessary, but changing this may break code.  For
        # safety, I save the original headers on different attributes for use
        # in writing the plain text periodic flat files.
        self._in_reply_to = message['in-reply-to']
        self._references = message['references']
        self._message_id = message['message-id']

        i_r_t = message['In-Reply-To']
        if i_r_t is None:
            self.in_reply_to = ''
        else:
            match = msgid_pat.search(i_r_t)
            if match is None: self.in_reply_to = ''
            else: self.in_reply_to = strip_separators(match.group(1))

        references = message['References']
        if references is None:
            self.references = []
        else:
            self.references = map(strip_separators, references.split())

        # Save any other interesting headers
        self.headers = {}
        for i in keepHeaders:
            if message.has_key(i):
                self.headers[i] = message[i]

        # Read the message body
        # Check Scrubber-munged paylaod
        if message.get('x-mailman-scrubbed'):
            decode = False
        else:
            decode = True
        s = StringIO(message.get_payload(decode=decode)\
                     or message.as_string().split('\n\n',1)[1])
        self.body = s.readlines()

    def _set_date(self, message):
        def floatdate(header):
            missing = []
            datestr = message.get(header, missing)
            if datestr is missing:
                return None
            date = parsedate_tz(datestr)
            try:
                return mktime_tz(date)
            except (TypeError, ValueError, OverflowError):
                return None
        date = floatdate('date')
        if date is None:
            date = floatdate('x-list-received-date')
        if date is None:
            # What's left to try?
            date = self._last_article_time + 1
        self._last_article_time = date
        self.date = '%011i' % date
        self.datestr = message.get('date') \
                       or message.get('x-list-received-date') \
                       or formatdate(date)

    def __repr__(self):
        return '<Article ID = '+repr(self.msgid)+'>'

    def finished_update_article(self):
        pass    

# Pipermail formatter class

class T:
    DIRMODE = 0755      # Mode to give to created directories
    FILEMODE = 0644     # Mode to give to created files
    INDEX_EXT = ".html" # Extension for indexes

    def __init__(self, basedir = None, reload = 1, database = None):
        # If basedir isn't provided, assume the current directory
        if basedir is None:
            self.basedir = os.getcwd()
        else:
            basedir = os.path.expanduser(basedir)
            self.basedir = basedir
        self.database = database

        # If the directory doesn't exist, create it.  This code shouldn't get
        # run anymore, we create the directory in Archiver.py.  It should only
        # get used by legacy lists created that are only receiving their first
        # message in the HTML archive now -- Marc
        try:
            os.stat(self.basedir)
        except os.error, errdata:
            errno, errmsg = errdata
            if errno != 2:
                raise os.error, errdata
            else:
                self.message(_('Creating archive directory ') + self.basedir)
                omask = os.umask(0)
                try:
                    os.mkdir(self.basedir, self.DIRMODE)
                finally:
                    os.umask(omask)

        # Try to load previously pickled state
        try:
            if not reload:
                raise IOError
            f = open(os.path.join(self.basedir, 'pipermail.pck'), 'r')
            self.message(_('Reloading pickled archive state'))
            d = pickle.load(f)
            f.close()
            for key, value in d.items():
                setattr(self, key, value)
        except (IOError, EOFError):
            # No pickled version, so initialize various attributes
            self.archives = []        # Archives
            self._dirty_archives = [] # Archives that will have to be updated
            self.sequence = 0         # Sequence variable used for
                                      #   numbering articles
            self.update_TOC = 0       # Does the TOC need updating?
        #
        # make the basedir variable work when passed in as an __init__ arg
        # and different from the one in the pickle.  Let the one passed in
        # as an __init__ arg take precedence if it's stated.  This way, an
        # archive can be moved from one place to another and still work.
        #
        if basedir != self.basedir:
            self.basedir = basedir

    def close(self):
        "Close an archive, save its state, and update any changed archives."
        self.update_dirty_archives()
        self.update_TOC = 0
        self.write_TOC()
        # Save the collective state
        self.message(_('Pickling archive state into ')
                     + os.path.join(self.basedir, 'pipermail.pck'))
        self.database.close()
        del self.database

        omask = os.umask(007)
        try:
            f = open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
        finally:
            os.umask(omask)
        pickle.dump(self.getstate(), f)
        f.close()

    def getstate(self):
        # can override this in subclass
        return self.__dict__

    #
    # Private methods
    #
    # These will be neither overridden nor called by custom archivers.
    #


    # Create a dictionary of various parameters that will be passed
    # to the write_index_{header,footer} functions
    def __set_parameters(self, archive):
        # Determine the earliest and latest date in the archive
        firstdate = self.database.firstdate(archive)
        lastdate = self.database.lastdate(archive)

        # Get the current time
        now = time.asctime(time.localtime(time.time()))
        self.firstdate = firstdate
        self.lastdate = lastdate
        self.archivedate = now
        self.size = self.database.numArticles(archive)
        self.archive = archive
        self.version = __version__

    # Find the message ID of an article's parent, or return None
    # if no parent can be found.

    def __findParent(self, article, children = []):
            parentID = None
            if article.in_reply_to:
                parentID = article.in_reply_to
            elif article.references:
                # Remove article IDs that aren't in the archive
                refs = filter(self.articleIndex.has_key, article.references)
                if not refs:
                    return None
                maxdate = self.database.getArticle(self.archive,
                                                   refs[0])
                for ref in refs[1:]:
                    a = self.database.getArticle(self.archive, ref)
                    if a.date > maxdate.date:
                        maxdate = a
                parentID = maxdate.msgid
            else:
                # Look for the oldest matching subject
                try:
                    key, tempid = \
                         self.subjectIndex.set_location(article.subject)
                    print key, tempid
                    self.subjectIndex.next()
                    [subject, date] = key.split('\0')
                    print article.subject, subject, date
                    if subject == article.subject and tempid not in children:
                        parentID = tempid
                except KeyError:
                    pass
            return parentID

    # Update the threaded index completely
    def updateThreadedIndex(self):
        # Erase the threaded index
        self.database.clearIndex(self.archive, 'thread')

        # Loop over all the articles
        msgid = self.database.first(self.archive, 'date')
        while msgid is not None:
            try:
                article = self.database.getArticle(self.archive, msgid)
            except KeyError:
                pass
            else:
                if article.parentID is None or \
                   not self.database.hasArticle(self.archive,
                                                article.parentID):
                    # then
                    pass
                else:
                    parent = self.database.getArticle(self.archive,
                                                    article.parentID)
                    article.threadKey = parent.threadKey+article.date+'-'
                self.database.setThreadKey(self.archive,
                    (article.threadKey, article.msgid),
                    msgid)
            msgid = self.database.next(self.archive, 'date')

    #
    # Public methods:
    #
    # These are part of the public interface of the T class, but will
    # never be overridden (unless you're trying to do something very new).

    # Update a single archive's indices, whether the archive's been
    # dirtied or not.
    def update_archive(self, archive):
        self.archive = archive
        self.message(_("Updating index files for archive [%(archive)s]"))
        arcdir = os.path.join(self.basedir, archive)
        self.__set_parameters(archive)

        for hdr in ('Date', 'Subject', 'Author'):
            self._update_simple_index(hdr, archive, arcdir)

        self._update_thread_index(archive, arcdir)

    def _update_simple_index(self, hdr, archive, arcdir):
        self.message("  " + hdr)
        self.type = hdr
        hdr = hdr.lower()

        self._open_index_file_as_stdout(arcdir, hdr)
        self.write_index_header()
        count = 0
        # Loop over the index entries
        msgid = self.database.first(archive, hdr)
        while msgid is not None:
            try:
                article = self.database.getArticle(self.archive, msgid)
            except KeyError:
                pass
            else:
                count = count + 1
                self.write_index_entry(article)
            msgid = self.database.next(archive, hdr)
        # Finish up this index
        self.write_index_footer()
        self._restore_stdout()

    def _update_thread_index(self, archive, arcdir):
        self.message(_("  Thread"))
        self._open_index_file_as_stdout(arcdir, "thread")
        self.type = 'Thread'
        self.write_index_header()

        # To handle the prev./next in thread pointers, we need to
        # track articles 5 at a time.

        # Get the first 5 articles
        L = [None] * 5
        i = 2
        msgid = self.database.first(self.archive, 'thread')

        while msgid is not None and i < 5:
            L[i] = self.database.getArticle(self.archive, msgid)
            i = i + 1
            msgid = self.database.next(self.archive, 'thread')

        while L[2] is not None:
            article = L[2]
            artkey = None
            if article is not None:
                artkey = article.threadKey
            if artkey is not None:
                self.write_threadindex_entry(article, artkey.count('-') - 1)
                if self.database.changed.has_key((archive,article.msgid)):
                    a1 = L[1]
                    a3 = L[3]
                    self.update_article(arcdir, article, a1, a3)
                    if a3 is not None:
                        self.database.changed[(archive, a3.msgid)] = None
                    if a1 is not None:
                        key = archive, a1.msgid
                        if not self.database.changed.has_key(key):
                            self.update_article(arcdir, a1, L[0], L[2])
                        else:
                            del self.database.changed[key]
            if L[0]:
                L[0].finished_update_article()
            L = L[1:]                   # Rotate the list
            if msgid is None:
                L.append(msgid)
            else:
                L.append(self.database.getArticle(self.archive, msgid))
            msgid = self.database.next(self.archive, 'thread')

        self.write_index_footer()
        self._restore_stdout()

    def _open_index_file_as_stdout(self, arcdir, index_name):
        path = os.path.join(arcdir, index_name + self.INDEX_EXT)
        omask = os.umask(002)
        try:
            self.__f = open(path, 'w')
        finally:
            os.umask(omask)
        self.__stdout = sys.stdout
        sys.stdout = self.__f

    def _restore_stdout(self):
        sys.stdout = self.__stdout
        self.__f.close()
        del self.__f
        del self.__stdout

    # Update only archives that have been marked as "changed".
    def update_dirty_archives(self):
        for i in self._dirty_archives:
            self.update_archive(i)
        self._dirty_archives = []

    # Read a Unix mailbox file from the file object <input>,
    # and create a series of Article objects.  Each article
    # object will then be archived.

    def _makeArticle(self, msg, sequence):
        return Article(msg, sequence)

    def processUnixMailbox(self, input, start=None, end=None):
        mbox = ArchiverMailbox(input, self.maillist)
        if start is None:
            start = 0
        counter = 0
        while counter < start:
            try:
                m = mbox.next()
            except Errors.DiscardMessage:
                continue
            if m is None:
                return
            counter += 1
        while 1:
            try:
                pos = input.tell()
                m = mbox.next()
            except Errors.DiscardMessage:
                continue
            except Exception:
                syslog('error', 'uncaught archiver exception at filepos: %s',
                       pos)
                raise
            if m is None:
                break
            if m == '':
                # It was an unparseable message
                continue
            msgid = m.get('message-id', 'n/a')
            self.message(_('#%(counter)05d %(msgid)s'))
            a = self._makeArticle(m, self.sequence)
            self.sequence += 1
            self.add_article(a)
            if end is not None and counter >= end:
               break
            counter += 1

    def new_archive(self, archive, archivedir):
        self.archives.append(archive)
        self.update_TOC = 1
        self.database.newArchive(archive)
        # If the archive directory doesn't exist, create it
        try:
            os.stat(archivedir)
        except os.error, errdata:
            errno, errmsg = errdata
            if errno == 2:
                omask = os.umask(0)
                try:
                    os.mkdir(archivedir, self.DIRMODE)
                finally:
                    os.umask(omask)
            else:
                raise os.error, errdata
        self.open_new_archive(archive, archivedir)

    def add_article(self, article):
        archives = self.get_archives(article)
        if not archives:
            return
        if type(archives) == type(''):
            archives = [archives]

        article.filename = filename = self.get_filename(article)
        temp = self.format_article(article)
        for arch in archives:
            self.archive = arch # why do this???
            archivedir = os.path.join(self.basedir, arch)
            if arch not in self.archives:
                self.new_archive(arch, archivedir)

            # Write the HTML-ized article
            self.write_article(arch, temp, os.path.join(archivedir,
                                                        filename))

            if article.decoded.has_key('author'):
                author = fixAuthor(article.decoded['author'])
            else:
                author = fixAuthor(article.author)
            if article.decoded.has_key('stripped'):
                subject = article.decoded['stripped'].lower()
            else:
                subject = article.subject.lower()

            article.parentID = parentID = self.get_parent_info(arch, article)
            if parentID:
                parent = self.database.getArticle(arch, parentID)
                article.threadKey = parent.threadKey + article.date + '-'
            else:
                article.threadKey = article.date + '-'
            key = article.threadKey, article.msgid

            self.database.setThreadKey(arch, key, article.msgid)
            self.database.addArticle(arch, temp, author=author,
                                     subject=subject)

            if arch not in self._dirty_archives:
                self._dirty_archives.append(arch)

    def get_parent_info(self, archive, article):
        parentID = None
        if article.in_reply_to:
            parentID = article.in_reply_to
        elif article.references:
            refs = self._remove_external_references(article.references)
            if refs:
                maxdate = self.database.getArticle(archive, refs[0])
                for ref in refs[1:]:
                    a = self.database.getArticle(archive, ref)
                    if a.date > maxdate.date:
                        maxdate = a
                parentID = maxdate.msgid
        else:
            # Get the oldest article with a matching subject, and
            # assume this is a follow-up to that article
            parentID = self.database.getOldestArticle(archive,
                                                      article.subject)

        if parentID and not self.database.hasArticle(archive, parentID):
            parentID = None
        return parentID

    def write_article(self, index, article, path):
        omask = os.umask(002)
        try:
            f = open(path, 'w')
        finally:
            os.umask(omask)
        temp_stdout, sys.stdout = sys.stdout, f
        self.write_article_header(article)
        sys.stdout.writelines(article.body)
        self.write_article_footer(article)
        sys.stdout = temp_stdout
        f.close()

    def _remove_external_references(self, refs):
        keep = []
        for ref in refs:
            if self.database.hasArticle(self.archive, ref):
                keep.append(ref)
        return keep

    # Abstract methods: these will need to be overridden by subclasses
    # before anything useful can be done.

    def get_filename(self, article):
        pass
    def get_archives(self, article):
        """Return a list of indexes where the article should be filed.
        A string can be returned if the list only contains one entry,
        and the empty list is legal."""
        pass
    def format_article(self, article):
        pass
    def write_index_header(self):
        pass
    def write_index_footer(self):
        pass
    def write_index_entry(self, article):
        pass
    def write_threadindex_entry(self, article, depth):
        pass
    def write_article_header(self, article):
        pass
    def write_article_footer(self, article):
        pass
    def write_article_entry(self, article):
        pass
    def update_article(self, archivedir, article, prev, next):
        pass
    def write_TOC(self):
        pass
    def open_new_archive(self, archive, dir):
        pass
    def message(self, msg):
        pass


class BSDDBdatabase(Database):
    __super_addArticle = Database.addArticle

    def __init__(self, basedir):
        self.__cachekeys = []
        self.__cachedict = {}
        self.__currentOpenArchive = None # The currently open indices
        self.basedir = os.path.expanduser(basedir)
        self.changed = {} # Recently added articles, indexed only by
                          # message ID

    def firstdate(self, archive):
        self.__openIndices(archive)
        date = 'None'
        try:
            date, msgid = self.dateIndex.first()
            date = time.asctime(time.localtime(float(date)))
        except KeyError:
            pass
        return date

    def lastdate(self, archive):
        self.__openIndices(archive)
        date = 'None'
        try:
            date, msgid = self.dateIndex.last()
            date = time.asctime(time.localtime(float(date)))
        except KeyError:
            pass
        return date

    def numArticles(self, archive):
        self.__openIndices(archive)
        return len(self.dateIndex)

    def addArticle(self, archive, article, subject=None, author=None,
                   date=None):
        self.__openIndices(archive)
        self.__super_addArticle(archive, article, subject, author, date)

    # Open the BSDDB files that are being used as indices
    # (dateIndex, authorIndex, subjectIndex, articleIndex)
    def __openIndices(self, archive):
        if self.__currentOpenArchive == archive:
            return

        import bsddb
        self.__closeIndices()
        arcdir = os.path.join(self.basedir, 'database')
        omask = os.umask(0)
        try:
            try:
                os.mkdir(arcdir, 02775)
            except OSError:
                # BAW: Hmm...
                pass
        finally:
            os.umask(omask)
        for hdr in ('date', 'author', 'subject', 'article', 'thread'):
            path = os.path.join(arcdir, archive + '-' + hdr)
            t = bsddb.btopen(path, 'c')
            setattr(self, hdr + 'Index', t)
        self.__currentOpenArchive = archive

    # Close the BSDDB files that are being used as indices (if they're
    # open--this is safe to call if they're already closed)
    def __closeIndices(self):
        if self.__currentOpenArchive is not None:
            pass
        for hdr in ('date', 'author', 'subject', 'thread', 'article'):
            attr = hdr + 'Index'
            if hasattr(self, attr):
                index = getattr(self, attr)
                if hdr == 'article':
                    if not hasattr(self, 'archive_length'):
                        self.archive_length = {}
                    self.archive_length[self.__currentOpenArchive] = len(index)
                index.close()
                delattr(self,attr)
        self.__currentOpenArchive = None

    def close(self):
        self.__closeIndices()
    def hasArticle(self, archive, msgid):
        self.__openIndices(archive)
        return self.articleIndex.has_key(msgid)
    def setThreadKey(self, archive, key, msgid):
        self.__openIndices(archive)
        self.threadIndex[key] = msgid
    def getArticle(self, archive, msgid):
        self.__openIndices(archive)
        if self.__cachedict.has_key(msgid):
            self.__cachekeys.remove(msgid)
            self.__cachekeys.append(msgid)
            return self.__cachedict[msgid]
        if len(self.__cachekeys) == CACHESIZE:
            delkey, self.__cachekeys = (self.__cachekeys[0],
                                        self.__cachekeys[1:])
            del self.__cachedict[delkey]
        s = self.articleIndex[msgid]
        article = pickle.loads(s)
        self.__cachekeys.append(msgid)
        self.__cachedict[msgid] = article
        return article

    def first(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        try:
            key, msgid = index.first()
            return msgid
        except KeyError:
            return None
    def next(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        try:
            key, msgid = index.next()
        except KeyError:
            return None
        else:
            return msgid

    def getOldestArticle(self, archive, subject):
        self.__openIndices(archive)
        subject = subject.lower()
        try:
            key, tempid = self.subjectIndex.set_location(subject)
            self.subjectIndex.next()
            [subject2, date] = key.split('\0')
            if subject != subject2:
                return None
            return tempid
        except KeyError: # XXX what line raises the KeyError?
            return None

    def newArchive(self, archive):
        pass

    def clearIndex(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        finished = 0
        try:
            key, msgid = self.threadIndex.first()
        except KeyError:
            finished = 1
        while not finished:
            del self.threadIndex[key]
            try:
                key, msgid = self.threadIndex.next()
            except KeyError:
                finished = 1