1 # Copyright (C) 2004-2006 Python Software Foundation
   2 # Authors: Baxter, Wouters and Warsaw
   3 # Contact: email-sig@python.org
   4 
   5 """FeedParser - An email feed parser.
   6 
   7 The feed parser implements an interface for incrementally parsing an email
   8 message, line by line.  This has advantages for certain applications, such as
   9 those reading email messages off a socket.
  10 
  11 FeedParser.feed() is the primary interface for pushing new data into the
  12 parser.  It returns when there's nothing more it can do with the available
  13 data.  When you have no more data to push into the parser, call .close().
  14 This completes the parsing and returns the root message object.
  15 
  16 The other advantage of this parser is that it will never raise a parsing
  17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
  18 the current message.  Defects are just instances that live on the message
  19 object's .defects attribute.
  20 """
  21 
  22 __all__ = ['FeedParser', 'BytesFeedParser']
  23 
  24 import re
  25 
  26 from email import errors
  27 from email import message
  28 from email._policybase import compat32
  29 
  30 NLCRE = re.compile('\r\n|\r|\n')
  31 NLCRE_bol = re.compile('(\r\n|\r|\n)')
  32 NLCRE_eol = re.compile('(\r\n|\r|\n)\Z')
  33 NLCRE_crack = re.compile('(\r\n|\r|\n)')
  34 # RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
  35 # except controls, SP, and ":".
  36 headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
  37 EMPTYSTRING = ''
  38 NL = '\n'
  39 
  40 NeedMoreData = object()
  41 
  42 
  43 
  44 class BufferedSubFile(object):
  45     """A file-ish object that can have new data loaded into it.
  46 
  47     You can also push and pop line-matching predicates onto a stack.  When the
  48     current predicate matches the current line, a false EOF response
  49     (i.e. empty string) is returned instead.  This lets the parser adhere to a
  50     simple abstraction -- it parses until EOF closes the current message.
  51     """
  52     def __init__(self):
  53         # The last partial line pushed into this object.
  54         self._partial = ''
  55         # The list of full, pushed lines, in reverse order
  56         self._lines = []
  57         # The stack of false-EOF checking predicates.
  58         self._eofstack = []
  59         # A flag indicating whether the file has been closed or not.
  60         self._closed = False
  61 
  62     def push_eof_matcher(self, pred):
  63         self._eofstack.append(pred)
  64 
  65     def pop_eof_matcher(self):
  66         return self._eofstack.pop()
  67 
  68     def close(self):
  69         # Don't forget any trailing partial line.
  70         self._lines.append(self._partial)
  71         self._partial = ''
  72         self._closed = True
  73 
  74     def readline(self):
  75         if not self._lines:
  76             if self._closed:
  77                 return ''
  78             return NeedMoreData
  79         # Pop the line off the stack and see if it matches the current
  80         # false-EOF predicate.
  81         line = self._lines.pop()
  82         # RFC 2046, section 5.1.2 requires us to recognize outer level
  83         # boundaries at any level of inner nesting.  Do this, but be sure it's
  84         # in the order of most to least nested.
  85         for ateof in self._eofstack[::-1]:
  86             if ateof(line):
  87                 # We're at the false EOF.  But push the last line back first.
  88                 self._lines.append(line)
  89                 return ''
  90         return line
  91 
  92     def unreadline(self, line):
  93         # Let the consumer push a line back into the buffer.
  94         assert line is not NeedMoreData
  95         self._lines.append(line)
  96 
  97     def push(self, data):
  98         """Push some new data into this object."""
  99         # Handle any previous leftovers
 100         data, self._partial = self._partial + data, ''
 101         # Crack into lines, but preserve the linesep characters on the end of each
 102         parts = data.splitlines(True)
 103         # If the last element of the list does not end in a newline, then treat
 104         # it as a partial line.  We only check for '\n' here because a line
 105         # ending with '\r' might be a line that was split in the middle of a
 106         # '\r\n' sequence (see bugs 1555570 and 1721862).
 107         if parts and not parts[-1].endswith('\n'):
 108             self._partial = parts.pop()
 109         self.pushlines(parts)
 110 
 111     def pushlines(self, lines):
 112         # Reverse and insert at the front of the lines.
 113         self._lines[:0] = lines[::-1]
 114 
 115     def __iter__(self):
 116         return self
 117 
 118     def __next__(self):
 119         line = self.readline()
 120         if line == '':
 121             raise StopIteration
 122         return line
 123 
 124 
 125 
 126 class FeedParser:
 127     """A feed-style parser of email."""
 128 
 129     def __init__(self, _factory=message.Message, *, policy=compat32):
 130         """_factory is called with no arguments to create a new message obj
 131 
 132         The policy keyword specifies a policy object that controls a number of
 133         aspects of the parser's operation.  The default policy maintains
 134         backward compatibility.
 135 
 136         """
 137         self._factory = _factory
 138         self.policy = policy
 139         try:
 140             _factory(policy=self.policy)
 141             self._factory_kwds = lambda: {'policy': self.policy}
 142         except TypeError:
 143             # Assume this is an old-style factory
 144             self._factory_kwds = lambda: {}
 145         self._input = BufferedSubFile()
 146         self._msgstack = []
 147         self._parse = self._parsegen().__next__
 148         self._cur = None
 149         self._last = None
 150         self._headersonly = False
 151 
 152     # Non-public interface for supporting Parser's headersonly flag
 153     def _set_headersonly(self):
 154         self._headersonly = True
 155 
 156     def feed(self, data):
 157         """Push more data into the parser."""
 158         self._input.push(data)
 159         self._call_parse()
 160 
 161     def _call_parse(self):
 162         try:
 163             self._parse()
 164         except StopIteration:
 165             pass
 166 
 167     def close(self):
 168         """Parse all remaining data and return the root message object."""
 169         self._input.close()
 170         self._call_parse()
 171         root = self._pop_message()
 172         assert not self._msgstack
 173         # Look for final set of defects
 174         if root.get_content_maintype() == 'multipart' \
 175                and not root.is_multipart():
 176             defect = errors.MultipartInvariantViolationDefect()
 177             self.policy.handle_defect(root, defect)
 178         return root
 179 
 180     def _new_message(self):
 181         msg = self._factory(**self._factory_kwds())
 182         if self._cur and self._cur.get_content_type() == 'multipart/digest':
 183             msg.set_default_type('message/rfc822')
 184         if self._msgstack:
 185             self._msgstack[-1].attach(msg)
 186         self._msgstack.append(msg)
 187         self._cur = msg
 188         self._last = msg
 189 
 190     def _pop_message(self):
 191         retval = self._msgstack.pop()
 192         if self._msgstack:
 193             self._cur = self._msgstack[-1]
 194         else:
 195             self._cur = None
 196         return retval
 197 
 198     def _parsegen(self):
 199         # Create a new message and start by parsing headers.
 200         self._new_message()
 201         headers = []
 202         # Collect the headers, searching for a line that doesn't match the RFC
 203         # 2822 header or continuation pattern (including an empty line).
 204         for line in self._input:
 205             if line is NeedMoreData:
 206                 yield NeedMoreData
 207                 continue
 208             if not headerRE.match(line):
 209                 # If we saw the RFC defined header/body separator
 210                 # (i.e. newline), just throw it away. Otherwise the line is
 211                 # part of the body so push it back.
 212                 if not NLCRE.match(line):
 213                     defect = errors.MissingHeaderBodySeparatorDefect()
 214                     self.policy.handle_defect(self._cur, defect)
 215                     self._input.unreadline(line)
 216                 break
 217             headers.append(line)
 218         # Done with the headers, so parse them and figure out what we're
 219         # supposed to see in the body of the message.
 220         self._parse_headers(headers)
 221         # Headers-only parsing is a backwards compatibility hack, which was
 222         # necessary in the older parser, which could raise errors.  All
 223         # remaining lines in the input are thrown into the message body.
 224         if self._headersonly:
 225             lines = []
 226             while True:
 227                 line = self._input.readline()
 228                 if line is NeedMoreData:
 229                     yield NeedMoreData
 230                     continue
 231                 if line == '':
 232                     break
 233                 lines.append(line)
 234             self._cur.set_payload(EMPTYSTRING.join(lines))
 235             return
 236         if self._cur.get_content_type() == 'message/delivery-status':
 237             # message/delivery-status contains blocks of headers separated by
 238             # a blank line.  We'll represent each header block as a separate
 239             # nested message object, but the processing is a bit different
 240             # than standard message/* types because there is no body for the
 241             # nested messages.  A blank line separates the subparts.
 242             while True:
 243                 self._input.push_eof_matcher(NLCRE.match)
 244                 for retval in self._parsegen():
 245                     if retval is NeedMoreData:
 246                         yield NeedMoreData
 247                         continue
 248                     break
 249                 msg = self._pop_message()
 250                 # We need to pop the EOF matcher in order to tell if we're at
 251                 # the end of the current file, not the end of the last block
 252                 # of message headers.
 253                 self._input.pop_eof_matcher()
 254                 # The input stream must be sitting at the newline or at the
 255                 # EOF.  We want to see if we're at the end of this subpart, so
 256                 # first consume the blank line, then test the next line to see
 257                 # if we're at this subpart's EOF.
 258                 while True:
 259                     line = self._input.readline()
 260                     if line is NeedMoreData:
 261                         yield NeedMoreData
 262                         continue
 263                     break
 264                 while True:
 265                     line = self._input.readline()
 266                     if line is NeedMoreData:
 267                         yield NeedMoreData
 268                         continue
 269                     break
 270                 if line == '':
 271                     break
 272                 # Not at EOF so this is a line we're going to need.
 273                 self._input.unreadline(line)
 274             return
 275         if self._cur.get_content_maintype() == 'message':
 276             # The message claims to be a message/* type, then what follows is
 277             # another RFC 2822 message.
 278             for retval in self._parsegen():
 279                 if retval is NeedMoreData:
 280                     yield NeedMoreData
 281                     continue
 282                 break
 283             self._pop_message()
 284             return
 285         if self._cur.get_content_maintype() == 'multipart':
 286             boundary = self._cur.get_boundary()
 287             if boundary is None:
 288                 # The message /claims/ to be a multipart but it has not
 289                 # defined a boundary.  That's a problem which we'll handle by
 290                 # reading everything until the EOF and marking the message as
 291                 # defective.
 292                 defect = errors.NoBoundaryInMultipartDefect()
 293                 self.policy.handle_defect(self._cur, defect)
 294                 lines = []
 295                 for line in self._input:
 296                     if line is NeedMoreData:
 297                         yield NeedMoreData
 298                         continue
 299                     lines.append(line)
 300                 self._cur.set_payload(EMPTYSTRING.join(lines))
 301                 return
 302             # Make sure a valid content type was specified per RFC 2045:6.4.
 303             if (self._cur.get('content-transfer-encoding', '8bit').lower()
 304                     not in ('7bit', '8bit', 'binary')):
 305                 defect = errors.InvalidMultipartContentTransferEncodingDefect()
 306                 self.policy.handle_defect(self._cur, defect)
 307             # Create a line match predicate which matches the inter-part
 308             # boundary as well as the end-of-multipart boundary.  Don't push
 309             # this onto the input stream until we've scanned past the
 310             # preamble.
 311             separator = '--' + boundary
 312             boundaryre = re.compile(
 313                 '(?P<sep>' + re.escape(separator) +
 314                 r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
 315             capturing_preamble = True
 316             preamble = []
 317             linesep = False
 318             close_boundary_seen = False
 319             while True:
 320                 line = self._input.readline()
 321                 if line is NeedMoreData:
 322                     yield NeedMoreData
 323                     continue
 324                 if line == '':
 325                     break
 326                 mo = boundaryre.match(line)
 327                 if mo:
 328                     # If we're looking at the end boundary, we're done with
 329                     # this multipart.  If there was a newline at the end of
 330                     # the closing boundary, then we need to initialize the
 331                     # epilogue with the empty string (see below).
 332                     if mo.group('end'):
 333                         close_boundary_seen = True
 334                         linesep = mo.group('linesep')
 335                         break
 336                     # We saw an inter-part boundary.  Were we in the preamble?
 337                     if capturing_preamble:
 338                         if preamble:
 339                             # According to RFC 2046, the last newline belongs
 340                             # to the boundary.
 341                             lastline = preamble[-1]
 342                             eolmo = NLCRE_eol.search(lastline)
 343                             if eolmo:
 344                                 preamble[-1] = lastline[:-len(eolmo.group(0))]
 345                             self._cur.preamble = EMPTYSTRING.join(preamble)
 346                         capturing_preamble = False
 347                         self._input.unreadline(line)
 348                         continue
 349                     # We saw a boundary separating two parts.  Consume any
 350                     # multiple boundary lines that may be following.  Our
 351                     # interpretation of RFC 2046 BNF grammar does not produce
 352                     # body parts within such double boundaries.
 353                     while True:
 354                         line = self._input.readline()
 355                         if line is NeedMoreData:
 356                             yield NeedMoreData
 357                             continue
 358                         mo = boundaryre.match(line)
 359                         if not mo:
 360                             self._input.unreadline(line)
 361                             break
 362                     # Recurse to parse this subpart; the input stream points
 363                     # at the subpart's first line.
 364                     self._input.push_eof_matcher(boundaryre.match)
 365                     for retval in self._parsegen():
 366                         if retval is NeedMoreData:
 367                             yield NeedMoreData
 368                             continue
 369                         break
 370                     # Because of RFC 2046, the newline preceding the boundary
 371                     # separator actually belongs to the boundary, not the
 372                     # previous subpart's payload (or epilogue if the previous
 373                     # part is a multipart).
 374                     if self._last.get_content_maintype() == 'multipart':
 375                         epilogue = self._last.epilogue
 376                         if epilogue == '':
 377                             self._last.epilogue = None
 378                         elif epilogue is not None:
 379                             mo = NLCRE_eol.search(epilogue)
 380                             if mo:
 381                                 end = len(mo.group(0))
 382                                 self._last.epilogue = epilogue[:-end]
 383                     else:
 384                         payload = self._last._payload
 385                         if isinstance(payload, str):
 386                             mo = NLCRE_eol.search(payload)
 387                             if mo:
 388                                 payload = payload[:-len(mo.group(0))]
 389                                 self._last._payload = payload
 390                     self._input.pop_eof_matcher()
 391                     self._pop_message()
 392                     # Set the multipart up for newline cleansing, which will
 393                     # happen if we're in a nested multipart.
 394                     self._last = self._cur
 395                 else:
 396                     # I think we must be in the preamble
 397                     assert capturing_preamble
 398                     preamble.append(line)
 399             # We've seen either the EOF or the end boundary.  If we're still
 400             # capturing the preamble, we never saw the start boundary.  Note
 401             # that as a defect and store the captured text as the payload.
 402             if capturing_preamble:
 403                 defect = errors.StartBoundaryNotFoundDefect()
 404                 self.policy.handle_defect(self._cur, defect)
 405                 self._cur.set_payload(EMPTYSTRING.join(preamble))
 406                 epilogue = []
 407                 for line in self._input:
 408                     if line is NeedMoreData:
 409                         yield NeedMoreData
 410                         continue
 411                 self._cur.epilogue = EMPTYSTRING.join(epilogue)
 412                 return
 413             # If we're not processing the preamble, then we might have seen
 414             # EOF without seeing that end boundary...that is also a defect.
 415             if not close_boundary_seen:
 416                 defect = errors.CloseBoundaryNotFoundDefect()
 417                 self.policy.handle_defect(self._cur, defect)
 418                 return
 419             # Everything from here to the EOF is epilogue.  If the end boundary
 420             # ended in a newline, we'll need to make sure the epilogue isn't
 421             # None
 422             if linesep:
 423                 epilogue = ['']
 424             else:
 425                 epilogue = []
 426             for line in self._input:
 427                 if line is NeedMoreData:
 428                     yield NeedMoreData
 429                     continue
 430                 epilogue.append(line)
 431             # Any CRLF at the front of the epilogue is not technically part of
 432             # the epilogue.  Also, watch out for an empty string epilogue,
 433             # which means a single newline.
 434             if epilogue:
 435                 firstline = epilogue[0]
 436                 bolmo = NLCRE_bol.match(firstline)
 437                 if bolmo:
 438                     epilogue[0] = firstline[len(bolmo.group(0)):]
 439             self._cur.epilogue = EMPTYSTRING.join(epilogue)
 440             return
 441         # Otherwise, it's some non-multipart type, so the entire rest of the
 442         # file contents becomes the payload.
 443         lines = []
 444         for line in self._input:
 445             if line is NeedMoreData:
 446                 yield NeedMoreData
 447                 continue
 448             lines.append(line)
 449         self._cur.set_payload(EMPTYSTRING.join(lines))
 450 
 451     def _parse_headers(self, lines):
 452         # Passed a list of lines that make up the headers for the current msg
 453         lastheader = ''
 454         lastvalue = []
 455         for lineno, line in enumerate(lines):
 456             # Check for continuation
 457             if line[0] in ' \t':
 458                 if not lastheader:
 459                     # The first line of the headers was a continuation.  This
 460                     # is illegal, so let's note the defect, store the illegal
 461                     # line, and ignore it for purposes of headers.
 462                     defect = errors.FirstHeaderLineIsContinuationDefect(line)
 463                     self.policy.handle_defect(self._cur, defect)
 464                     continue
 465                 lastvalue.append(line)
 466                 continue
 467             if lastheader:
 468                 self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
 469                 lastheader, lastvalue = '', []
 470             # Check for envelope header, i.e. unix-from
 471             if line.startswith('From '):
 472                 if lineno == 0:
 473                     # Strip off the trailing newline
 474                     mo = NLCRE_eol.search(line)
 475                     if mo:
 476                         line = line[:-len(mo.group(0))]
 477                     self._cur.set_unixfrom(line)
 478                     continue
 479                 elif lineno == len(lines) - 1:
 480                     # Something looking like a unix-from at the end - it's
 481                     # probably the first line of the body, so push back the
 482                     # line and stop.
 483                     self._input.unreadline(line)
 484                     return
 485                 else:
 486                     # Weirdly placed unix-from line.  Note this as a defect
 487                     # and ignore it.
 488                     defect = errors.MisplacedEnvelopeHeaderDefect(line)
 489                     self._cur.defects.append(defect)
 490                     continue
 491             # Split the line on the colon separating field name from value.
 492             # There will always be a colon, because if there wasn't the part of
 493             # the parser that calls us would have started parsing the body.
 494             i = line.find(':')
 495             assert i>0, "_parse_headers fed line with no : and no leading WS"
 496             lastheader = line[:i]
 497             lastvalue = [line]
 498         # Done with all the lines, so handle the last header.
 499         if lastheader:
 500             self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
 501 
 502 
 503 class BytesFeedParser(FeedParser):
 504     """Like FeedParser, but feed accepts bytes."""
 505 
 506     def feed(self, data):
 507         super().feed(data.decode('ascii', 'surrogateescape'))