Inbound: correctly parse long (folded) headers in raw MIME messages

Work around a Python 2 email.parser.Parser bug in handling RFC5322
folded headers. Fixes problems where long headers in inbound mail
(e.g., Subject) get truncated or have unexpected spaces.

This change also updates AnymailInboundMessage.parse_raw_mime to use
the improved email.policy.default on Python 3 (rather than the
parser's default "compat32" policy). This likely fixes several other
parsing bugs on Python 3; those bugs will still affect code running
on Python 2.

Improves inbound parsing for all ESPs that provide raw MIME email
(Mailgun, Mandrill, SendGrid, SparkPost).
This commit is contained in:
medmunds
2018-03-23 16:56:45 -07:00
parent 0c3e3e9bad
commit 70094cf3bc
2 changed files with 70 additions and 26 deletions

View File

@@ -1,6 +1,6 @@
from base64 import b64decode from base64 import b64decode
from email import message_from_string
from email.message import Message from email.message import Message
from email.parser import Parser
from email.utils import unquote from email.utils import unquote
import six import six
@@ -8,36 +8,54 @@ from django.core.files.uploadedfile import SimpleUploadedFile
from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date
# Python 2/3.*-compatible email.parser.HeaderParser(policy=email.policy.default) # Work around bugs in older versions of email.parser.Parser
try: try:
# With Python 3.3+ (email6) package, can use HeaderParser with default policy # With Python 3.3+ (email6) package, using `policy=email.policy.default`
from email.parser import HeaderParser # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
from email.policy import default as accurate_header_unfolding_policy # vs. compat32 # which *preserves* earlier bugs.)
from email.policy import default
class EmailParser(Parser):
def __init__(self, _class=None, policy=default): # don't default to compat32 policy
super(EmailParser, self).__init__(_class, policy=policy)
except ImportError: except ImportError:
# Earlier Pythons don't have HeaderParser, and/or try preserve earlier compatibility bugs # Pre-Python 3.3 email package: try to work around some bugs
# by failing to properly unfold headers (see RFC 5322 section 2.2.3)
from email.parser import Parser
import re import re
accurate_header_unfolding_policy = object()
class HeaderParser(Parser, object): class EmailParser(Parser):
def __init__(self, _class, policy=None): def parsestr(self, text, headersonly=False):
# This "backport" doesn't actually support policies, but we want to ensure # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3).
# that callers aren't trying to use HeaderParser's default compat32 policy # Help it out by pre-unfolding the headers for it.
# (which doesn't properly unfold headers) # This only works for root headers, not ones within a MIME subpart.
assert policy is accurate_header_unfolding_policy # (Finding subpart headers requires actually parsing the message.)
super(HeaderParser, self).__init__(_class) headers, body = _split_headers_and_body(text)
unfolded = "".join([_unfold_headers(headers), body])
return Parser.parsestr(self, unfolded, headersonly=headersonly)
def parsestr(self, text, headersonly=True): # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers.
unfolded = self._unfold_headers(text) # It tries to support defective messages missing a blank line between headers and body
return super(HeaderParser, self).parsestr(unfolded, headersonly=True) # (but introduces other problems, e.g., https://bugs.python.org/issue26686).
# Since those messages are already out of spec, this code doesn't worry about them.
_body_sep_re = re.compile(r'(\r\n|\r|\n)(\1)') # "an empty line" allowing CRLF, CR, or LF endings (but not mixed)
_header_fold_re = re.compile(r'(\r\n|\r|\n)(?=[ \t])') # "any CRLF that is immediately followed by WSP"
def _split_headers_and_body(text):
# RFC5322 section 2.1:
# "The body ... is separated from the header section by an empty line (i.e., a line with nothing
# preceding the CRLF)." (And per email.parser semantics, this allows CRLF, CR, or LF endings)
parts = _body_sep_re.split(text, maxsplit=1) # [headers, sep, sep, body] or just [headers]
try:
return "".join(parts[0:2]), "".join(parts[2:])
except IndexError:
assert len(parts) == 1
return parts[0], ""
@staticmethod
def _unfold_headers(text): def _unfold_headers(text):
# RFC5322 section 2.2.3:
# "Unfolding is accomplished by simply removing any CRLF that is immediately followed by WSP" # "Unfolding is accomplished by simply removing any CRLF that is immediately followed by WSP"
# (WSP is space or tab, and per email.parser semantics, we allow CRLF, CR, or LF endings) # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings)
return re.sub(r'(\r\n|\r|\n)(?=[ \t])', "", text) return _header_fold_re.sub("", text)
class AnymailInboundMessage(Message, object): # `object` ensures new-style class in Python 2) class AnymailInboundMessage(Message, object): # `object` ensures new-style class in Python 2)
@@ -226,7 +244,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
@classmethod @classmethod
def parse_raw_mime(cls, s): def parse_raw_mime(cls, s):
"""Returns a new AnymailInboundMessage parsed from str s""" """Returns a new AnymailInboundMessage parsed from str s"""
return message_from_string(s, cls) return EmailParser(cls).parsestr(s)
@classmethod @classmethod
def construct(cls, raw_headers=None, from_email=None, to=None, cc=None, subject=None, headers=None, def construct(cls, raw_headers=None, from_email=None, to=None, cc=None, subject=None, headers=None,
@@ -252,7 +270,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
:return: {AnymailInboundMessage} :return: {AnymailInboundMessage}
""" """
if raw_headers is not None: if raw_headers is not None:
msg = HeaderParser(cls, policy=accurate_header_unfolding_policy).parsestr(raw_headers) msg = EmailParser(cls).parsestr(raw_headers, headersonly=True)
msg.set_payload(None) # headersonly forces an empty string payload, which breaks things later msg.set_payload(None) # headersonly forces an empty string payload, which breaks things later
else: else:
msg = cls() msg = cls()

View File

@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
from base64 import b64encode from base64 import b64encode
@@ -387,3 +388,28 @@ class AnymailInboundMessageAttachedMessageTests(SimpleTestCase):
self.assertIsInstance(orig_msg, AnymailInboundMessage) self.assertIsInstance(orig_msg, AnymailInboundMessage)
self.assertEqual(orig_msg['Subject'], "Original message") self.assertEqual(orig_msg['Subject'], "Original message")
self.assertEqual(orig_msg.get_content_type(), "multipart/related") self.assertEqual(orig_msg.get_content_type(), "multipart/related")
class EmailParserWorkaroundTests(SimpleTestCase):
# Anymail includes workarounds for (some of) the more problematic bugs
# in the Python 2 email.parser.Parser.
def test_parse_folded_headers(self):
raw = dedent("""\
Content-Type: text/plain
Subject: This subject uses
header folding
X-Json: {"problematic":
["encoded newline\\n",
"comma,semi;no space"]}
Not-A-Header: This is the body.
It is not folded.
""")
msg = AnymailInboundMessage.parse_raw_mime(raw)
self.assertEqual(msg['Subject'], "This subject uses header folding")
self.assertEqual(msg["X-Json"],
'{"problematic": ["encoded newline\\n", "comma,semi;no space"]}')
self.assertEqual(msg.get_content_text(),
"Not-A-Header: This is the body.\n It is not folded.\n")
self.assertEqual(msg.defects, [])