1 # Copyright (C) 2002-2007 Python Software Foundation
   2 # Contact: email-sig@python.org
   3 
   4 """Email address parsing code.
   5 
   6 Lifted directly from rfc822.py.  This should eventually be rewritten.
   7 """
   8 
# Public names of this module: the only ones bound by a star-import.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
  15 
  16 import time, calendar
  17 
# Shared string constants.  SPACE and EMPTYSTRING serve as str.join()
# separators in the address parser defined later in this module.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
  21 
# Parse a date field

# Lower-case month names: the twelve three-letter abbreviations followed by
# the twelve full names.  _parsedate_tz() locates a name with .index() and
# folds positions 13-24 back onto 1-12, so the order matters.  ('may' occurs
# in both halves; .index() returns the first occurrence.)
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Lower-case day-of-week abbreviations; used only to recognise (and skip) a
# leading day name that is not followed by a comma.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values use the same signed HHMM encoding as a numeric zone such as -0500
# (so -500 means five hours west of UTC); _parsedate_tz() converts them to
# seconds.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
  43 
  44 
  45 def parsedate_tz(data):
  46     """Convert a date string to a time tuple.
  47 
  48     Accounts for military timezones.
  49     """
  50     res = _parsedate_tz(data)
  51     if not res:
  52         return
  53     if res[9] is None:
  54         res[9] = 0
  55     return tuple(res)
  56 
  57 def _parsedate_tz(data):
  58     """Convert date to extended time tuple.
  59 
  60     The last (additional) element is the time zone offset in seconds, except if
  61     the timezone was specified as -0000.  In that case the last element is
  62     None.  This indicates a UTC timestamp that explicitly declaims knowledge of
  63     the source timezone, as opposed to a +0000 timestamp that indicates the
  64     source timezone really was UTC.
  65 
  66     """
  67     if not data:
  68         return
  69     data = data.split()
  70     # The FWS after the comma after the day-of-week is optional, so search and
  71     # adjust for this.
  72     if data[0].endswith(',') or data[0].lower() in _daynames:
  73         # There's a dayname here. Skip it
  74         del data[0]
  75     else:
  76         i = data[0].rfind(',')
  77         if i >= 0:
  78             data[0] = data[0][i+1:]
  79     if len(data) == 3: # RFC 850 date, deprecated
  80         stuff = data[0].split('-')
  81         if len(stuff) == 3:
  82             data = stuff + data[1:]
  83     if len(data) == 4:
  84         s = data[3]
  85         i = s.find('+')
  86         if i == -1:
  87             i = s.find('-')
  88         if i > 0:
  89             data[3:] = [s[:i], s[i:]]
  90         else:
  91             data.append('') # Dummy tz
  92     if len(data) < 5:
  93         return None
  94     data = data[:5]
  95     [dd, mm, yy, tm, tz] = data
  96     mm = mm.lower()
  97     if mm not in _monthnames:
  98         dd, mm = mm, dd.lower()
  99         if mm not in _monthnames:
 100             return None
 101     mm = _monthnames.index(mm) + 1
 102     if mm > 12:
 103         mm -= 12
 104     if dd[-1] == ',':
 105         dd = dd[:-1]
 106     i = yy.find(':')
 107     if i > 0:
 108         yy, tm = tm, yy
 109     if yy[-1] == ',':
 110         yy = yy[:-1]
 111     if not yy[0].isdigit():
 112         yy, tz = tz, yy
 113     if tm[-1] == ',':
 114         tm = tm[:-1]
 115     tm = tm.split(':')
 116     if len(tm) == 2:
 117         [thh, tmm] = tm
 118         tss = '0'
 119     elif len(tm) == 3:
 120         [thh, tmm, tss] = tm
 121     elif len(tm) == 1 and '.' in tm[0]:
 122         # Some non-compliant MUAs use '.' to separate time elements.
 123         tm = tm[0].split('.')
 124         if len(tm) == 2:
 125             [thh, tmm] = tm
 126             tss = 0
 127         elif len(tm) == 3:
 128             [thh, tmm, tss] = tm
 129     else:
 130         return None
 131     try:
 132         yy = int(yy)
 133         dd = int(dd)
 134         thh = int(thh)
 135         tmm = int(tmm)
 136         tss = int(tss)
 137     except ValueError:
 138         return None
 139     # Check for a yy specified in two-digit format, then convert it to the
 140     # appropriate four-digit format, according to the POSIX standard. RFC 822
 141     # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
 142     # mandates a 4-digit yy. For more information, see the documentation for
 143     # the time module.
 144     if yy < 100:
 145         # The year is between 1969 and 1999 (inclusive).
 146         if yy > 68:
 147             yy += 1900
 148         # The year is between 2000 and 2068 (inclusive).
 149         else:
 150             yy += 2000
 151     tzoffset = None
 152     tz = tz.upper()
 153     if tz in _timezones:
 154         tzoffset = _timezones[tz]
 155     else:
 156         try:
 157             tzoffset = int(tz)
 158         except ValueError:
 159             pass
 160         if tzoffset==0 and tz.startswith('-'):
 161             tzoffset = None
 162     # Convert a timezone offset into seconds ; -0500 -> -18000
 163     if tzoffset:
 164         if tzoffset < 0:
 165             tzsign = -1
 166             tzoffset = -tzoffset
 167         else:
 168             tzsign = 1
 169         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 170     # Daylight Saving Time flag is set to -1, since DST is unknown.
 171     return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
 172 
 173 
 174 def parsedate(data):
 175     """Convert a time string to a time tuple."""
 176     t = parsedate_tz(data)
 177     if isinstance(t, tuple):
 178         return t[:9]
 179     else:
 180         return t
 181 
 182 
 183 def mktime_tz(data):
 184     """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
 185     if data[9] is None:
 186         # No zone info, so localtime is better assumption than GMT
 187         return time.mktime(data[:8] + (-1,))
 188     else:
 189         t = calendar.timegm(data)
 190         return t - data[9]
 191 
 192 
 193 def quote(str):
 194     """Prepare string to be used in a quoted string.
 195 
 196     Turns backslash and double quote characters into quoted pairs.  These
 197     are the only characters that need to be quoted inside a quoted string.
 198     Does not add the surrounding double quotes.
 199     """
 200     return str.replace('\\', '\\\\').replace('"', '\\"')
 201 
 202 
 203 class AddrlistClass:
 204     """Address parser class by Ben Escoto.
 205 
 206     To understand what this class does, it helps to have a copy of RFC 2822 in
 207     front of you.
 208 
 209     Note: this class interface is deprecated and may be removed in the future.
 210     Use email.utils.AddressList instead.
 211     """
 212 
 213     def __init__(self, field):
 214         """Initialize a new instance.
 215 
 216         `field' is an unparsed address header field, containing
 217         one or more addresses.
 218         """
 219         self.specials = '()<>@,:;.\"[]'
 220         self.pos = 0
 221         self.LWS = ' \t'
 222         self.CR = '\r\n'
 223         self.FWS = self.LWS + self.CR
 224         self.atomends = self.specials + self.LWS + self.CR
 225         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 226         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 227         # syntax, so allow dots in phrases.
 228         self.phraseends = self.atomends.replace('.', '')
 229         self.field = field
 230         self.commentlist = []
 231 
 232     def gotonext(self):
 233         """Skip white space and extract comments."""
 234         wslist = []
 235         while self.pos < len(self.field):
 236             if self.field[self.pos] in self.LWS + '\n\r':
 237                 if self.field[self.pos] not in '\n\r':
 238                     wslist.append(self.field[self.pos])
 239                 self.pos += 1
 240             elif self.field[self.pos] == '(':
 241                 self.commentlist.append(self.getcomment())
 242             else:
 243                 break
 244         return EMPTYSTRING.join(wslist)
 245 
 246     def getaddrlist(self):
 247         """Parse all addresses.
 248 
 249         Returns a list containing all of the addresses.
 250         """
 251         result = []
 252         while self.pos < len(self.field):
 253             ad = self.getaddress()
 254             if ad:
 255                 result += ad
 256             else:
 257                 result.append(('', ''))
 258         return result
 259 
 260     def getaddress(self):
 261         """Parse the next address."""
 262         self.commentlist = []
 263         self.gotonext()
 264 
 265         oldpos = self.pos
 266         oldcl = self.commentlist
 267         plist = self.getphraselist()
 268 
 269         self.gotonext()
 270         returnlist = []
 271 
 272         if self.pos >= len(self.field):
 273             # Bad email address technically, no domain.
 274             if plist:
 275                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
 276 
 277         elif self.field[self.pos] in '.@':
 278             # email address is just an addrspec
 279             # this isn't very efficient since we start over
 280             self.pos = oldpos
 281             self.commentlist = oldcl
 282             addrspec = self.getaddrspec()
 283             returnlist = [(SPACE.join(self.commentlist), addrspec)]
 284 
 285         elif self.field[self.pos] == ':':
 286             # address is a group
 287             returnlist = []
 288 
 289             fieldlen = len(self.field)
 290             self.pos += 1
 291             while self.pos < len(self.field):
 292                 self.gotonext()
 293                 if self.pos < fieldlen and self.field[self.pos] == ';':
 294                     self.pos += 1
 295                     break
 296                 returnlist = returnlist + self.getaddress()
 297 
 298         elif self.field[self.pos] == '<':
 299             # Address is a phrase then a route addr
 300             routeaddr = self.getrouteaddr()
 301 
 302             if self.commentlist:
 303                 returnlist = [(SPACE.join(plist) + ' (' +
 304                                ' '.join(self.commentlist) + ')', routeaddr)]
 305             else:
 306                 returnlist = [(SPACE.join(plist), routeaddr)]
 307 
 308         else:
 309             if plist:
 310                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
 311             elif self.field[self.pos] in self.specials:
 312                 self.pos += 1
 313 
 314         self.gotonext()
 315         if self.pos < len(self.field) and self.field[self.pos] == ',':
 316             self.pos += 1
 317         return returnlist
 318 
 319     def getrouteaddr(self):
 320         """Parse a route address (Return-path value).
 321 
 322         This method just skips all the route stuff and returns the addrspec.
 323         """
 324         if self.field[self.pos] != '<':
 325             return
 326 
 327         expectroute = False
 328         self.pos += 1
 329         self.gotonext()
 330         adlist = ''
 331         while self.pos < len(self.field):
 332             if expectroute:
 333                 self.getdomain()
 334                 expectroute = False
 335             elif self.field[self.pos] == '>':
 336                 self.pos += 1
 337                 break
 338             elif self.field[self.pos] == '@':
 339                 self.pos += 1
 340                 expectroute = True
 341             elif self.field[self.pos] == ':':
 342                 self.pos += 1
 343             else:
 344                 adlist = self.getaddrspec()
 345                 self.pos += 1
 346                 break
 347             self.gotonext()
 348 
 349         return adlist
 350 
 351     def getaddrspec(self):
 352         """Parse an RFC 2822 addr-spec."""
 353         aslist = []
 354 
 355         self.gotonext()
 356         while self.pos < len(self.field):
 357             preserve_ws = True
 358             if self.field[self.pos] == '.':
 359                 if aslist and not aslist[-1].strip():
 360                     aslist.pop()
 361                 aslist.append('.')
 362                 self.pos += 1
 363                 preserve_ws = False
 364             elif self.field[self.pos] == '"':
 365                 aslist.append('"%s"' % quote(self.getquote()))
 366             elif self.field[self.pos] in self.atomends:
 367                 if aslist and not aslist[-1].strip():
 368                     aslist.pop()
 369                 break
 370             else:
 371                 aslist.append(self.getatom())
 372             ws = self.gotonext()
 373             if preserve_ws and ws:
 374                 aslist.append(ws)
 375 
 376         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 377             return EMPTYSTRING.join(aslist)
 378 
 379         aslist.append('@')
 380         self.pos += 1
 381         self.gotonext()
 382         return EMPTYSTRING.join(aslist) + self.getdomain()
 383 
 384     def getdomain(self):
 385         """Get the complete domain name from an address."""
 386         sdlist = []
 387         while self.pos < len(self.field):
 388             if self.field[self.pos] in self.LWS:
 389                 self.pos += 1
 390             elif self.field[self.pos] == '(':
 391                 self.commentlist.append(self.getcomment())
 392             elif self.field[self.pos] == '[':
 393                 sdlist.append(self.getdomainliteral())
 394             elif self.field[self.pos] == '.':
 395                 self.pos += 1
 396                 sdlist.append('.')
 397             elif self.field[self.pos] in self.atomends:
 398                 break
 399             else:
 400                 sdlist.append(self.getatom())
 401         return EMPTYSTRING.join(sdlist)
 402 
 403     def getdelimited(self, beginchar, endchars, allowcomments=True):
 404         """Parse a header fragment delimited by special characters.
 405 
 406         `beginchar' is the start character for the fragment.
 407         If self is not looking at an instance of `beginchar' then
 408         getdelimited returns the empty string.
 409 
 410         `endchars' is a sequence of allowable end-delimiting characters.
 411         Parsing stops when one of these is encountered.
 412 
 413         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 414         within the parsed fragment.
 415         """
 416         if self.field[self.pos] != beginchar:
 417             return ''
 418 
 419         slist = ['']
 420         quote = False
 421         self.pos += 1
 422         while self.pos < len(self.field):
 423             if quote:
 424                 slist.append(self.field[self.pos])
 425                 quote = False
 426             elif self.field[self.pos] in endchars:
 427                 self.pos += 1
 428                 break
 429             elif allowcomments and self.field[self.pos] == '(':
 430                 slist.append(self.getcomment())
 431                 continue        # have already advanced pos from getcomment
 432             elif self.field[self.pos] == '\\':
 433                 quote = True
 434             else:
 435                 slist.append(self.field[self.pos])
 436             self.pos += 1
 437 
 438         return EMPTYSTRING.join(slist)
 439 
 440     def getquote(self):
 441         """Get a quote-delimited fragment from self's field."""
 442         return self.getdelimited('"', '"\r', False)
 443 
 444     def getcomment(self):
 445         """Get a parenthesis-delimited fragment from self's field."""
 446         return self.getdelimited('(', ')\r', True)
 447 
 448     def getdomainliteral(self):
 449         """Parse an RFC 2822 domain-literal."""
 450         return '[%s]' % self.getdelimited('[', ']\r', False)
 451 
 452     def getatom(self, atomends=None):
 453         """Parse an RFC 2822 atom.
 454 
 455         Optional atomends specifies a different set of end token delimiters
 456         (the default is to use self.atomends).  This is used e.g. in
 457         getphraselist() since phrase endings must not include the `.' (which
 458         is legal in phrases)."""
 459         atomlist = ['']
 460         if atomends is None:
 461             atomends = self.atomends
 462 
 463         while self.pos < len(self.field):
 464             if self.field[self.pos] in atomends:
 465                 break
 466             else:
 467                 atomlist.append(self.field[self.pos])
 468             self.pos += 1
 469 
 470         return EMPTYSTRING.join(atomlist)
 471 
 472     def getphraselist(self):
 473         """Parse a sequence of RFC 2822 phrases.
 474 
 475         A phrase is a sequence of words, which are in turn either RFC 2822
 476         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 477         runs of continuous whitespace into one space.
 478         """
 479         plist = []
 480 
 481         while self.pos < len(self.field):
 482             if self.field[self.pos] in self.FWS:
 483                 self.pos += 1
 484             elif self.field[self.pos] == '"':
 485                 plist.append(self.getquote())
 486             elif self.field[self.pos] == '(':
 487                 self.commentlist.append(self.getcomment())
 488             elif self.field[self.pos] in self.phraseends:
 489                 break
 490             else:
 491                 plist.append(self.getatom(self.phraseends))
 492 
 493         return plist
 494 
 495 class AddressList(AddrlistClass):
 496     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 497     def __init__(self, field):
 498         AddrlistClass.__init__(self, field)
 499         if field:
 500             self.addresslist = self.getaddrlist()
 501         else:
 502             self.addresslist = []
 503 
 504     def __len__(self):
 505         return len(self.addresslist)
 506 
 507     def __add__(self, other):
 508         # Set union
 509         newaddr = AddressList(None)
 510         newaddr.addresslist = self.addresslist[:]
 511         for x in other.addresslist:
 512             if not x in self.addresslist:
 513                 newaddr.addresslist.append(x)
 514         return newaddr
 515 
 516     def __iadd__(self, other):
 517         # Set union, in-place
 518         for x in other.addresslist:
 519             if not x in self.addresslist:
 520                 self.addresslist.append(x)
 521         return self
 522 
 523     def __sub__(self, other):
 524         # Set difference
 525         newaddr = AddressList(None)
 526         for x in self.addresslist:
 527             if not x in other.addresslist:
 528                 newaddr.addresslist.append(x)
 529         return newaddr
 530 
 531     def __isub__(self, other):
 532         # Set difference, in-place
 533         for x in other.addresslist:
 534             if x in self.addresslist:
 535                 self.addresslist.remove(x)
 536         return self
 537 
 538     def __getitem__(self, index):
 539         # Make indexing, slices, and 'in' work
 540         return self.addresslist[index]