# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser', 'BytesFeedParser']

import re

from email import errors
from email import message
from email._policybase import compat32

NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Raw string: '\Z' is a regex anchor, not a valid Python string escape.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# Matches one physical line at a time: either text terminated by CRLF, CR or
# LF (terminator included), or a trailing run of text with no terminator.
# Unlike str.splitlines(), no other character counts as a line boundary.
_LINECRE = re.compile(r'[^\r\n]*(?:\r\n|\r|\n)|[^\r\n]+')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parser
# generator) when no complete line is available yet and the input is not
# closed.
NeedMoreData = object()



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It must be queued through
        # pushlines() so that it is read *after* every complete line that is
        # still buffered; _lines is stored in reverse, so a plain append()
        # would hand it out first.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the linesep characters on the end of
        # each.  Only CR, LF and CRLF are line separators in a message;
        # str.splitlines() would also split on characters such as '\x0b',
        # '\x0c' or '\x85' and cut header lines in half.
        parts = _LINECRE.findall(data)
        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if parts and not parts[-1].endswith('\n'):
            self._partial = parts.pop()
        self.pushlines(parts)

    def pushlines(self, lines):
        """Queue complete lines to be read after those already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at a (false) EOF; NeedMoreData is passed through to
        # the consumer as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line



class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message, *, policy=compat32):
        """_factory is called with no arguments to create a new message obj

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        try:
            # Probe whether the factory accepts a policy keyword.
            _factory(policy=self.policy)
            self._factory_kwds = lambda: {'policy': self.policy}
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call resumes the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parser; exhaustion of the generator is not an error.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to the message on top of the stack (if
        # any) and make it the current one.
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop the finished message and make its parent (if any) current.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator driving the parse of one message (recursively for
        # subparts).  It yields NeedMoreData whenever the input buffer has no
        # complete line available and finishes once the message's (possibly
        # false) EOF has been reached.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # Drain the remaining input; nothing is collected here, so
                # the epilogue ends up as the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Report it through the policy, like
                    # every other defect, so that raise_on_defect is honored.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')
            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode data as ASCII (non-ASCII bytes are kept as surrogate
        escapes) and push it into the parser."""
        super().feed(data.decode('ascii', 'surrogateescape'))