Inbound: correctly parse long (folded) headers in raw MIME messages

Work around a Python 2 email.parser.Parser bug in handling RFC 5322 folded headers. Fixes problems where long headers in inbound mail (e.g., Subject) get truncated or have unexpected spaces. This change also updates AnymailInboundMessage.parse_raw_mime to use the improved "default" email.policy on Python 3 (rather than Parser's default "compat32" policy). This likely fixes several other parsing bugs on Python 3; those bugs will still affect code running on Python 2. Improves inbound parsing for all ESPs that provide raw MIME email (Mailgun, Mandrill, SendGrid, SparkPost).
2026-02-05 12:05:21 -05:00 · 2018-03-23 16:56:45 -07:00
parent 0c3e3e9bad
commit 70094cf3bc
2 changed files with 70 additions and 26 deletions
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -1,6 +1,6 @@
 from base64 import b64decode
 from email import message_from_string
 from email.message import Message
 from email.parser import Parser
 from email.utils import unquote
 import six
@@ -8,36 +8,54 @@ from django.core.files.uploadedfile import SimpleUploadedFile
 from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date
-# Python 2/3.*-compatible email.parser.HeaderParser(policy=email.policy.default)
+# Work around bugs in older versions of email.parser.Parser
 try:
-    # With Python 3.3+ (email6) package, can use HeaderParser with default policy
+    # With Python 3.3+ (email6) package, using `policy=email.policy.default`
-    from email.parser import HeaderParser
+    # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
-    from email.policy import default as accurate_header_unfolding_policy  # vs. compat32
+    # which *preserves* earlier bugs.)
    from email.policy import default
    class EmailParser(Parser):
        """email.parser.Parser whose policy defaults to `email.policy.default`.

        The constructor arguments are passed straight through to Parser; the
        only difference is the default value of `policy`.
        """

        def __init__(self, _class=None, policy=default):
            # Always hand Parser an explicit policy, so it never falls back
            # to its own compat32 default.
            super(EmailParser, self).__init__(_class=_class, policy=policy)
 except ImportError:
-    # Earlier Pythons don't have HeaderParser, and/or try preserve earlier compatibility bugs
+    # Pre-Python 3.3 email package: try to work around some bugs
    # by failing to properly unfold headers (see RFC 5322 section 2.2.3)
    from email.parser import Parser
    import re
    accurate_header_unfolding_policy = object()
-    class HeaderParser(Parser, object):
+    class EmailParser(Parser):
-        def __init__(self, _class, policy=None):
+        def parsestr(self, text, headersonly=False):
-            # This "backport" doesn't actually support policies, but we want to ensure
+            # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3).
-            # that callers aren't trying to use HeaderParser's default compat32 policy
+            # Help it out by pre-unfolding the headers for it.
-            # (which doesn't properly unfold headers)
+            # This only works for root headers, not ones within a MIME subpart.
-            assert policy is accurate_header_unfolding_policy
+            # (Finding subpart headers requires actually parsing the message.)
-            super(HeaderParser, self).__init__(_class)
+            headers, body = _split_headers_and_body(text)
            unfolded = "".join([_unfold_headers(headers), body])
            return Parser.parsestr(self, unfolded, headersonly=headersonly)
-        def parsestr(self, text, headersonly=True):
+    # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers.
-            unfolded = self._unfold_headers(text)
+    # It tries to support defective messages missing a blank line between headers and body
-            return super(HeaderParser, self).parsestr(unfolded, headersonly=True)
+    # (but introduces other problems, e.g., https://bugs.python.org/issue26686).
    # Since those messages are already out of spec, this code doesn't worry about them.
    # "An empty line": two consecutive, identical line endings
    # (CRLF, CR, or LF are all allowed, but not a mix of them).
    _body_sep_re = re.compile(r'(\r\n|\r|\n)(\1)')
    # "Any CRLF that is immediately followed by WSP" (WSP is space or tab).
    _header_fold_re = re.compile(r'(\r\n|\r|\n)(?=[ \t])')

    def _split_headers_and_body(text):
        """Split raw message text into a (headers, body) pair.

        RFC5322 section 2.1:
        "The body ... is separated from the header section by an empty line
        (i.e., a line with nothing preceding the CRLF)."
        (And per email.parser semantics, this allows CRLF, CR, or LF endings.)

        The returned `headers` keeps the line ending that terminates the last
        header line, and `body` starts with the empty separator line, so that
        `headers + body == text` always holds.

        :param text: the complete raw message
        :return: (headers, body); body is "" if text has no empty line
        """
        # An empty header section: the very first line is the separator.
        # Without this check the whole text would be reported as headers.
        if text[:1] in ("\r", "\n"):
            return "", text

        separator = _body_sep_re.search(text)
        if separator is None:
            # No empty line at all: everything is header section.
            return text, ""

        # Cut between the two line endings: the first one terminates the
        # last header line, the second one is the empty separator line.
        boundary = separator.end(1)
        return text[:boundary], text[boundary:]
        @staticmethod
    def _unfold_headers(text):
        # RFC5322 section 2.2.3:
        # "Unfolding is accomplished by simply removing any CRLF that is immediately followed by WSP"
-            # (WSP is space or tab, and per email.parser semantics, we allow CRLF, CR, or LF endings)
+        # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings)
-            return re.sub(r'(\r\n|\r|\n)(?=[ \t])', "", text)
+        return _header_fold_re.sub("", text)
 class AnymailInboundMessage(Message, object):  # `object` ensures new-style class in Python 2)
@@ -226,7 +244,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
    @classmethod
    def parse_raw_mime(cls, s):
        """Returns a new AnymailInboundMessage parsed from str s"""
-        return message_from_string(s, cls)
+        return EmailParser(cls).parsestr(s)
    @classmethod
    def construct(cls, raw_headers=None, from_email=None, to=None, cc=None, subject=None, headers=None,
@@ -252,7 +270,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
        :return: {AnymailInboundMessage}
        """
        if raw_headers is not None:
-            msg = HeaderParser(cls, policy=accurate_header_unfolding_policy).parsestr(raw_headers)
+            msg = EmailParser(cls).parsestr(raw_headers, headersonly=True)
            msg.set_payload(None)  # headersonly forces an empty string payload, which breaks things later
        else:
            msg = cls()
--- a/tests/test_inbound.py
+++ b/tests/test_inbound.py
@@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 from base64 import b64encode
@@ -387,3 +388,28 @@ class AnymailInboundMessageAttachedMessageTests(SimpleTestCase):
        self.assertIsInstance(orig_msg, AnymailInboundMessage)
        self.assertEqual(orig_msg['Subject'], "Original message")
        self.assertEqual(orig_msg.get_content_type(), "multipart/related")
 class EmailParserWorkaroundTests(SimpleTestCase):
    # Anymail includes workarounds for (some of) the more problematic bugs
    # in the Python 2 email.parser.Parser.
    def test_parse_folded_headers(self):
        # Folded header lines must be unfolded into a single value, while body
        # lines that merely look like folded headers must be left untouched.
        # The empty line after X-Json is what separates the header section from
        # the body (RFC 5322 section 2.1); without it, "Not-A-Header" would be
        # parsed as a header and the body would be empty.
        raw = dedent("""\
            Content-Type: text/plain
            Subject: This subject uses
             header folding
            X-Json: {"problematic":
             ["encoded newline\\n",
             "comma,semi;no space"]}

            Not-A-Header: This is the body.
             It is not folded.
            """)
        msg = AnymailInboundMessage.parse_raw_mime(raw)
        self.assertEqual(msg['Subject'], "This subject uses header folding")
        self.assertEqual(msg["X-Json"],
                         '{"problematic": ["encoded newline\\n", "comma,semi;no space"]}')
        self.assertEqual(msg.get_content_text(),
                         "Not-A-Header: This is the body.\n It is not folded.\n")
        # Parsing must not have recorded any defects on the message.
        self.assertEqual(msg.defects, [])