# Copyright (C) 1998-2018 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Recognizes simple heuristically delimited bounces."""

import re

try:
    import email.Iterators
    _body_line_iterator = email.Iterators.body_line_iterator
except ImportError:
    # Interpreters that no longer ship the capitalized alias only provide
    # the lowercase module name.
    # NOTE(review): this fallback keeps the module importable; on Python 3
    # the stdlib iterator appears to skip the bytes that
    # get_payload(decode=True) returns, so a real port would need explicit
    # decoding -- confirm before relying on it.
    import email.iterators
    _body_line_iterator = email.iterators.body_line_iterator


def _c(pattern):
    """Compile `pattern' as a case-insensitive regular expression."""
    return re.compile(pattern, re.IGNORECASE)


# Pattern to match any valid email address and not much more.
VALID = _c(r'^[\x21-\x3d\x3f\x41-\x7e]+@[a-z0-9._]+$')

# This is a list of tuples of the form
#
#     (start cre, end cre, address cre)
#
# where `cre' means compiled regular expression, start is the line just before
# the bouncing address block, end is the line just after the bouncing address
# block, and address cre is the regexp that will recognize the addresses.  It
# must have a group called `addr' which will contain exactly and only the
# address that bounced.
PATTERNS = [
    # sdm.de
    (_c('here is your list of failed recipients'),
     _c('here is your returned mail'),
     _c(r'<(?P<addr>[^>]*)>')),
    # sz-sb.de, corridor.com, nfg.nl
    (_c('the following addresses had'),
     _c('transcript of session follows'),
     _c(r'^ *(\(expanded from: )?<?(?P<addr>[^\s@]+@[^\s@>]+?)>?\)?\s*$')),
    # robanal.demon.co.uk
    (_c('this message was created automatically by mail delivery software'),
     _c('original message follows'),
     _c(r'rcpt to:\s*<(?P<addr>[^>]*)>')),
    # s1.com (InterScan E-Mail VirusWall NT ???)
    (_c('message from interscan e-mail viruswall nt'),
     _c('end of message'),
     _c(r'rcpt to:\s*<(?P<addr>[^>]*)>')),
    # Smail
    (_c('failed addresses follow:'),
     _c('message text follows:'),
     _c(r'\s*(?P<addr>\S+@\S+)')),
    # newmail.ru
    (_c('This is the machine generated message from mail service.'),
     _c('--- Below the next line is a copy of the message.'),
     _c('<(?P<addr>[^>]*)>')),
    # turbosport.com runs something called `MDaemon 3.5.2' ???
    (_c('The following addresses did NOT receive a copy of your message:'),
     _c('--- Session Transcript ---'),
     _c(r'[>]\s*(?P<addr>.*)$')),
    # usa.net
    (_c(r'Intended recipient:\s*(?P<addr>.*)$'),
     _c('--------RETURNED MAIL FOLLOWS--------'),
     _c(r'Intended recipient:\s*(?P<addr>.*)$')),
    # hotpop.com
    (_c(r'Undeliverable Address:\s*(?P<addr>.*)$'),
     _c('Original message attached'),
     _c(r'Undeliverable Address:\s*(?P<addr>.*)$')),
    # Another demon.co.uk format
    (_c('This message was created automatically by mail delivery'),
     _c('^---- START OF RETURNED MESSAGE ----'),
     _c("addressed to '(?P<addr>[^']*)'")),
    # Prodigy.net full mailbox
    (_c("User's mailbox is full:"),
     _c('Unable to deliver mail.'),
     _c(r"User's mailbox is full:\s*<(?P<addr>[^>]*)>")),
    # Microsoft SMTPSVC
    (_c('The email below could not be delivered to the following user:'),
     _c('Old message:'),
     _c('<(?P<addr>[^>]*)>')),
    # Yahoo on behalf of other domains like sbcglobal.net
    (_c(r'Unable to deliver message to the following address\(es\)\.'),
     _c(r'--- Original message follows\.'),
     _c('<(?P<addr>[^>]*)>:')),
    # googlemail.com
    (_c('Delivery to the following recipient(s)? failed'),
     _c('----- Original message -----'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # kundenserver.de, mxlogic.net
    (_c('A message that you( have)? sent could not be delivered'),
     _c('^---'),
     _c('<(?P<addr>[^>]*)>')),
    # another kundenserver.de
    (_c('A message that you( have)? sent could not be delivered'),
     _c('^---'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@:]+):')),
    # thehartford.com and amenworld.com
    (_c('Del(i|e)very to the following recipient(s)? (failed|was aborted)'),
     # this one may or may not have the original message, but there's nothing
     # unique to stop on, so stop on the first line of at least 3 characters
     # that doesn't start with 'D' (to not stop immediately) and has no '@'.
     _c('^[^D][^@]{2,}$'),
     _c(r'^\s*(. )?(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # and another thehartfod.com/hartfordlife.com
    (_c(r'^Your message\s*$'),
     _c('^because:'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # kviv.be (InterScan NT)
    (_c('^Unable to deliver message to'),
     _c(r'\*+\s+End of message\s+\*+'),
     _c('<(?P<addr>[^>]*)>')),
    # earthlink.net supported domains
    (_c('^Sorry, unable to deliver your message to'),
     _c('^A copy of the original message'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@]+)\s+')),
    # ademe.fr
    (_c('^A message could not be delivered to:'),
     _c('^Subject:'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # andrew.ac.jp
    (_c('^Invalid final delivery userid:'),
     _c('^Original message follows.'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # E500_SMTP_Mail_Service@lerctr.org and similar
    (_c('---- Failed Recipients ----'),
     _c(' Mail ----'),
     _c('<(?P<addr>[^>]*)>')),
    # cynergycom.net
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c(r'(?P<addr>[^\s@]+@[^\s@)]+)')),
    # LSMTP for Windows
    (_c(r'^--> Error description:\s*$'),
     _c('^Error-End:'),
     _c(r'^Error-for:\s+(?P<addr>[^\s@]+@[^\s@]+)')),
    # Qmail with a tri-language intro beginning in spanish
    (_c('Your message could not be delivered'),
     _c('^-'),
     _c('<(?P<addr>[^>]*)>:')),
    # socgen.com
    (_c('Your message could not be delivered to'),
     _c(r'^\s*$'),
     _c(r'(?P<addr>[^\s@]+@[^\s@]+)')),
    # dadoservice.it
    (_c('Your message has encountered delivery problems'),
     _c('Your message reads'),
     _c(r'addressed to\s*(?P<addr>[^\s@]+@[^\s@)]+)')),
    # gomaps.com
    (_c('Did not reach the following recipient'),
     _c(r'^\s*$'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)')),
    # EYOU MTA SYSTEM
    (_c('This is the deliver program at'),
     _c('^-'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@<>]+)')),
    # A non-standard qmail at ieo.it
    (_c('this is the email server at'),
     _c('^-'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)[\s,]')),
    # pla.net.py (MDaemon.PRO ?)
    (_c('- no such user here'),
     _c('There is no user'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@]+)\s')),
    # fastdnsservers.com
    (_c('The following recipient.*could not be reached'),
     _c('bogus stop pattern'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # lttf.com
    (_c('Could not deliver message to'),
     _c(r'^\s*--'),
     _c(r'^Failed Recipient:\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # uci.edu
    (_c('--------Message not delivered'),
     _c('--------Error Detail'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # Dovecot LDA Over quota MDN (bogus - should be DSN).
    (_c('^Your message'),
     _c('^Reporting'),
     _c(
     r'Your message to (?P<addr>[^\s@]+@[^\s@]+) was automatically rejected'
     )),
    # mail.ru
    (_c('A message that you sent was rejected'),
     _c('This is a copy of your message'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)')),
    # MailEnable
    (_c('Message could not be delivered to some recipients.'),
     _c('Message headers follow'),
     _c(r'Recipient: \[SMTP:(?P<addr>[^\s@]+@[^\s@]+)\]')),
    # Next one goes here...
    ]


def process(msg, patterns=None):
    """Return the bounced addresses found in the body of `msg'.

    `msg' is the message object handed to body_line_iterator().  `patterns'
    is an optional list of (start cre, end cre, address cre) tuples laid out
    like PATTERNS, which is used when the argument is omitted or None.

    Each pattern set is tried in order against the decoded body lines.  The
    first set that yields at least one address wins and later sets are not
    tried.  The result is a list of the collected addresses, stripped of
    surrounding angle brackets, that also match VALID; it is empty when
    nothing was recognized.
    """
    if patterns is None:
        patterns = PATTERNS
    # simple state machine
    #     0 = nothing seen yet
    #     1 = intro seen
    addrs = {}
    # MAS: This is a mess. The outer loop used to be over the message
    # so we only looped through the message once.  Looping through the
    # message for each set of patterns is obviously way more work, but
    # if we don't do it, problems arise because scre from the wrong
    # pattern set matches first and then acre doesn't match.  The
    # alternative is to split things into separate modules, but then
    # we process the message multiple times anyway.
    for scre, ecre, acre in patterns:
        state = 0
        for line in _body_line_iterator(msg, decode=True):
            if state == 0:
                if scre.search(line):
                    state = 1
            # Deliberately not an `elif': some formats carry the address on
            # the very line that matches the start pattern.
            if state == 1:
                mo = acre.search(line)
                if mo:
                    addr = mo.group('addr')
                    if addr:
                        addrs[addr.strip('<>')] = 1
                elif ecre.search(line):
                    break
        if addrs:
            break
    return [x for x in addrs if VALID.match(x)]