Inbound: correctly parse long (folded) headers in raw MIME messages

Work around a Python 2 email.parser.Parser bug in handling RFC 5322 folded headers. This fixes problems where long headers in inbound mail (e.g., Subject) were truncated or contained unexpected spaces. This change also updates AnymailInboundMessage.parse_raw_mime to use the improved "default" email.policy on Python 3 (rather than Parser's default "compat32" policy). That likely fixes several other parsing bugs on Python 3; those bugs will still affect code running on Python 2. Improves inbound parsing for all ESPs that provide raw MIME email (Mailgun, Mandrill, SendGrid, SparkPost).
2025-12-20 03:41:05 -05:00 · 2018-03-23 16:56:45 -07:00
parent 0c3e3e9bad
commit 70094cf3bc
2 changed files with 70 additions and 26 deletions
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -1,6 +1,6 @@
 from base64 import b64decode
-from email import message_from_string
 from email.message import Message
+from email.parser import Parser
 from email.utils import unquote

 import six
@@ -8,36 +8,54 @@ from django.core.files.uploadedfile import SimpleUploadedFile

 from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date

-# Python 2/3.*-compatible email.parser.HeaderParser(policy=email.policy.default)
+# Work around bugs in older versions of email.parser.Parser
 try:
-    # With Python 3.3+ (email6) package, can use HeaderParser with default policy
-    from email.parser import HeaderParser
-    from email.policy import default as accurate_header_unfolding_policy  # vs. compat32
+    # With Python 3.3+ (email6) package, using `policy=email.policy.default`
+    # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
+    # which *preserves* earlier bugs.)
+    from email.policy import default
+
+    class EmailParser(Parser):
+        def __init__(self, _class=None, policy=default):  # don't default to compat32 policy
+            super(EmailParser, self).__init__(_class, policy=policy)

 except ImportError:
-    # Earlier Pythons don't have HeaderParser, and/or try preserve earlier compatibility bugs
-    # by failing to properly unfold headers (see RFC 5322 section 2.2.3)
-    from email.parser import Parser
+    # Pre-Python 3.3 email package: try to work around some bugs
    import re
-    accurate_header_unfolding_policy = object()

-    class HeaderParser(Parser, object):
-        def __init__(self, _class, policy=None):
-            # This "backport" doesn't actually support policies, but we want to ensure
-            # that callers aren't trying to use HeaderParser's default compat32 policy
-            # (which doesn't properly unfold headers)
-            assert policy is accurate_header_unfolding_policy
-            super(HeaderParser, self).__init__(_class)
+    class EmailParser(Parser):
+        def parsestr(self, text, headersonly=False):
+            # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3).
+            # Help it out by pre-unfolding the headers for it.
+            # This only works for root headers, not ones within a MIME subpart.
+            # (Finding subpart headers requires actually parsing the message.)
+            headers, body = _split_headers_and_body(text)
+            unfolded = "".join([_unfold_headers(headers), body])
+            return Parser.parsestr(self, unfolded, headersonly=headersonly)

-        def parsestr(self, text, headersonly=True):
-            unfolded = self._unfold_headers(text)
-            return super(HeaderParser, self).parsestr(unfolded, headersonly=True)
+    # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers.
+    # It tries to support defective messages missing a blank line between headers and body
+    # (but introduces other problems, e.g., https://bugs.python.org/issue26686).
+    # Since those messages are already out of spec, this code doesn't worry about them.
+    _body_sep_re = re.compile(r'(\r\n|\r|\n)(\1)')  # "an empty line" allowing CRLF, CR, or LF endings (but not mixed)
+    _header_fold_re = re.compile(r'(\r\n|\r|\n)(?=[ \t])')  # "any CRLF that is immediately followed by WSP"

-        @staticmethod
-        def _unfold_headers(text):
-            # "Unfolding is accomplished by simply removing any CRLF that is immediately followed by WSP"
-            # (WSP is space or tab, and per email.parser semantics, we allow CRLF, CR, or LF endings)
-            return re.sub(r'(\r\n|\r|\n)(?=[ \t])', "", text)
+    def _split_headers_and_body(text):
+        # RFC5322 section 2.1:
+        # "The body ... is separated from the header section by an empty line (i.e., a line with nothing
+        # preceding the CRLF)."  (And per email.parser semantics, this allows CRLF, CR, or LF endings)
+        parts = _body_sep_re.split(text, maxsplit=1)  # [headers, sep, sep, body] or just [headers]
+        try:
+            return "".join(parts[0:2]), "".join(parts[2:])
+        except IndexError:
+            assert len(parts) == 1
+            return parts[0], ""
+
+    def _unfold_headers(text):
+        # RFC5322 section 2.2.3:
+        # "Unfolding is accomplished by simply removing any CRLF that is immediately followed by WSP"
+        # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings)
+        return _header_fold_re.sub("", text)


 class AnymailInboundMessage(Message, object):  # `object` ensures new-style class in Python 2)
@@ -226,7 +244,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
    @classmethod
    def parse_raw_mime(cls, s):
        """Returns a new AnymailInboundMessage parsed from str s"""
-        return message_from_string(s, cls)
+        return EmailParser(cls).parsestr(s)

    @classmethod
    def construct(cls, raw_headers=None, from_email=None, to=None, cc=None, subject=None, headers=None,
@@ -252,7 +270,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
        :return: {AnymailInboundMessage}
        """
        if raw_headers is not None:
-            msg = HeaderParser(cls, policy=accurate_header_unfolding_policy).parsestr(raw_headers)
+            msg = EmailParser(cls).parsestr(raw_headers, headersonly=True)
            msg.set_payload(None)  # headersonly forces an empty string payload, which breaks things later
        else:
            msg = cls()