From dbe48d48afd0c57b451b64ddf5cc3ab13b5b12af Mon Sep 17 00:00:00 2001 From: medmunds Date: Sun, 1 Apr 2018 15:05:29 -0700 Subject: [PATCH] Inbound: add parse_raw_mime_bytes and parse_raw_mime_file Useful for cases where ESP could send a raw 8bit message (and its charset is something other than utf-8). Also reworks earlier Python 2.7 workaround for email.parser.Parser header unfolding bugs to handle any text-like, file-like IO stream, without trying to manipulate the entire message as a single string. --- anymail/inbound.py | 116 ++++++++++++++++++++++++++++++++---------- tests/test_inbound.py | 46 ++++++++++++++--- 2 files changed, 126 insertions(+), 36 deletions(-) diff --git a/anymail/inbound.py b/anymail/inbound.py index 40a7cc0..5da235f 100644 --- a/anymail/inbound.py +++ b/anymail/inbound.py @@ -14,25 +14,27 @@ try: # avoids earlier bugs. (Note that Parser defaults to policy=compat32, # which *preserves* earlier bugs.) from email.policy import default + from email.parser import BytesParser class EmailParser(Parser): def __init__(self, _class=None, policy=default): # don't default to compat32 policy super(EmailParser, self).__init__(_class, policy=policy) + class EmailBytesParser(BytesParser): + def __init__(self, _class=None, policy=default): # don't default to compat32 policy + super(EmailBytesParser, self).__init__(_class, policy=policy) + except ImportError: # Pre-Python 3.3 email package: try to work around some bugs - import re from email.header import decode_header + from collections import deque class EmailParser(Parser): - def parsestr(self, text, headersonly=False): + def parse(self, fp, headersonly=False): # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3). # Help it out by pre-unfolding the headers for it. - # This only works for root headers, not ones within a MIME subpart. - # (Finding subpart headers requires actually parsing the message.)
- headers, body = _split_headers_and_body(text) - unfolded = "".join([_unfold_headers(headers), body]) - message = Parser.parsestr(self, unfolded, headersonly=headersonly) + fp = HeaderUnfoldingWrapper(fp) + message = Parser.parse(self, fp, headersonly=headersonly) # Older Parser doesn't decode RFC2047 headers, so fix them up here. # (Since messsage is fully parsed, can decode headers in all MIME subparts.) @@ -42,29 +44,74 @@ except ImportError: for name, value in part._headers] return message - # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers. - # It tries to support defective messages missing a blank line between headers and body - # (but introduces other problems, e.g., https://bugs.python.org/issue26686). - # Since those messages are already out of spec, this code doesn't worry about them. - _body_sep_re = re.compile(r'(\r\n|\r|\n)(\1)') # "an empty line" allowing CRLF, CR, or LF endings (but not mixed) - _header_fold_re = re.compile(r'(\r\n|\r|\n)(?=[ \t])') # "any CRLF that is immediately followed by WSP" + class EmailBytesParser(EmailParser): + def parsebytes(self, text, headersonly=False): + # In Python 2, bytes is str, and Parser.parsestr uses bytes-friendly cStringIO.StringIO. + return self.parsestr(text, headersonly) - def _split_headers_and_body(text): - # RFC5322 section 2.1: - # "The body ... is separated from the header section by an empty line (i.e., a line with nothing - # preceding the CRLF)." (And per email.parser semantics, this allows CRLF, CR, or LF endings) - parts = _body_sep_re.split(text, maxsplit=1) # [headers, sep, sep, body] or just [headers] - try: - return "".join(parts[0:2]), "".join(parts[2:]) - except IndexError: - assert len(parts) == 1 - return parts[0], "" + class HeaderUnfoldingWrapper: + """ + A wrapper for file-like objects passed to email.parser.Parser.parse which works + around older Parser bugs with folded email headers by pre-unfolding them. 
- def _unfold_headers(text): - # RFC5322 section 2.2.3: - # "Unfolding is accomplished by simply removing any CRLF that is immediately followed by WSP" - # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings) - return _header_fold_re.sub("", text) + This only works for headers at the message root, not ones within a MIME subpart. + (Accurately recognizing subpart headers would require parsing mixed-content boundaries.) + """ + + def __init__(self, fp): + self.fp = fp + self._in_headers = True + self._pushback = deque() + + def _readline(self, limit=-1): + try: + line = self._pushback.popleft() + except IndexError: + line = self.fp.readline(limit) + # cStringIO.readline doesn't recognize universal newlines; splitlines does + lines = line.splitlines(True) + if len(lines) > 1: + line = lines[0] + self._pushback.extend(lines[1:]) + return line + + def _peekline(self, limit=-1): + try: + line = self._pushback[0] + except IndexError: + line = self._readline(limit) + self._pushback.appendleft(line) + return line + + def readline(self, limit=-1): + line = self._readline(limit) + if self._in_headers: + line_without_end = line.rstrip("\r\n") # CRLF, CR, or LF -- "universal newlines" + if len(line_without_end) == 0: + # RFC5322 section 2.1: "The body ... is separated from the header section + # by an empty line (i.e., a line with nothing preceding the CRLF)." + self._in_headers = False + else: + # Is this header line folded? Need to check next line... + # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF + # that is immediately followed by WSP." (WSP is space or tab) + next_line = self._peekline(limit) + if next_line.startswith((' ', '\t')): + line = line_without_end + return line + + def read(self, size): + if self._in_headers: + # For simplicity, just read a line at a time while in the header section. 
+ # (This works because we know email.parser.Parser doesn't really care if it reads + # more or less data than it asked for -- it just pushes it into FeedParser either way.) + return self.readline(size) + elif len(self._pushback): + buf = ''.join(self._pushback) + self._pushback.clear() + return buf + else: + return self.fp.read(size) def _decode_rfc2047(value): result = value @@ -278,6 +325,19 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas """Returns a new AnymailInboundMessage parsed from str s""" return EmailParser(cls).parsestr(s) + @classmethod + def parse_raw_mime_bytes(cls, b): + """Returns a new AnymailInboundMessage parsed from bytes b""" + return EmailBytesParser(cls).parsebytes(b) + + @classmethod + def parse_raw_mime_file(cls, fp): + """Returns a new AnymailInboundMessage parsed from file-like object fp""" + if isinstance(fp.read(0), six.binary_type): + return EmailBytesParser(cls).parse(fp) + else: + return EmailParser(cls).parse(fp) + @classmethod def construct(cls, raw_headers=None, from_email=None, to=None, cc=None, subject=None, headers=None, text=None, text_charset='utf-8', html=None, html_charset='utf-8', diff --git a/tests/test_inbound.py b/tests/test_inbound.py index 718c956..697ac87 100644 --- a/tests/test_inbound.py +++ b/tests/test_inbound.py @@ -11,7 +11,7 @@ from django.test import SimpleTestCase from anymail.inbound import AnymailInboundMessage -from .utils import SAMPLE_IMAGE_FILENAME, python_has_broken_mime_param_handling, sample_image_content +from .utils import SAMPLE_IMAGE_FILENAME, python_has_broken_mime_param_handling, sample_email_path, sample_image_content SAMPLE_IMAGE_CONTENT = sample_image_content() @@ -153,6 +153,35 @@ class AnymailInboundMessageConstructionTests(SimpleTestCase): # (see test_attachment_as_uploaded_file below for parsing basic attachment from raw mime) + def test_parse_raw_mime_bytes(self): + raw = ( + b'Content-Type: text/plain; charset=ISO-8859-3\r\n' + 
b'Content-Transfer-Encoding: 8bit\r\n' + b'Subject: Test bytes\r\n' + b'\r\n' + b'\xD8i estas retpo\xFEto.\r\n') + msg = AnymailInboundMessage.parse_raw_mime_bytes(raw) + self.assertEqual(msg['Subject'], "Test bytes") + self.assertEqual(msg.get_content_text(), "Ĝi estas retpoŝto.\r\n") + self.assertEqual(msg.get_content_bytes(), b'\xD8i estas retpo\xFEto.\r\n') + self.assertEqual(msg.defects, []) + + def test_parse_raw_mime_file_text(self): + with open(sample_email_path(), mode="r") as fp: + msg = AnymailInboundMessage.parse_raw_mime_file(fp) + self.assertEqual(msg["Subject"], "Test email") + self.assertEqual(msg.text, "Hi Bob, This is a message. Thanks!\n") + self.assertEqual(msg.get_all("Received"), [ # this is the first line in the sample email file + "by luna.mailgun.net with SMTP mgrt 8734663311733; Fri, 03 May 2013 18:26:27 +0000"]) + + def test_parse_raw_mime_file_bytes(self): + with open(sample_email_path(), mode="rb") as fp: + msg = AnymailInboundMessage.parse_raw_mime_file(fp) + self.assertEqual(msg["Subject"], "Test email") + self.assertEqual(msg.text, "Hi Bob, This is a message. Thanks!\n") + self.assertEqual(msg.get_all("Received"), [ # this is the first line in the sample email file + "by luna.mailgun.net with SMTP mgrt 8734663311733; Fri, 03 May 2013 18:26:27 +0000"]) + class AnymailInboundMessageConveniencePropTests(SimpleTestCase): # AnymailInboundMessage defines several properties to simplify reading @@ -470,13 +499,14 @@ class EmailParserWorkaroundTests(SimpleTestCase): Not-A-Header: This is the body. It is not folded. 
""") - msg = AnymailInboundMessage.parse_raw_mime(raw) - self.assertEqual(msg['Subject'], "This subject uses header folding") - self.assertEqual(msg["X-Json"], - '{"problematic": ["encoded newline\\n", "comma,semi;no space"]}') - self.assertEqual(msg.get_content_text(), - "Not-A-Header: This is the body.\n It is not folded.\n") - self.assertEqual(msg.defects, []) + for end in ('\n', '\r', '\r\n'): # check NL, CR, and CRNL line-endings + msg = AnymailInboundMessage.parse_raw_mime(raw.replace('\n', end)) + self.assertEqual(msg['Subject'], "This subject uses header folding") + self.assertEqual(msg["X-Json"], + '{"problematic": ["encoded newline\\n", "comma,semi;no space"]}') + self.assertEqual(msg.get_content_text(), + "Not-A-Header: This is the body.{end} It is not folded.{end}".format(end=end)) + self.assertEqual(msg.defects, []) def test_parse_encoded_headers(self): # RFC2047 header encoding