From 3d27e3fe6b81c40659c8143a142918531098766f Mon Sep 17 00:00:00 2001 From: medmunds Date: Sat, 24 Mar 2018 10:03:18 -0700 Subject: [PATCH] Inbound: decode Unicode and other non-ASCII email headers on Python 2 In AnymailInboundMessage, work around Python 2 email.parser.Parser's lack of handling for RFC2047-encoded email headers. (The Python 3 email package already decodes these automatically.) Improves inbound handling on Python 2 for all ESPs that provide raw MIME email or raw headers with inbound events. (Mailgun, Mandrill, SendGrid, SparkPost.) --- anymail/inbound.py | 30 ++++++++++++++++++++++++++++- tests/test_inbound.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/anymail/inbound.py b/anymail/inbound.py index 770aa77..11b3f44 100644 --- a/anymail/inbound.py +++ b/anymail/inbound.py @@ -22,6 +22,7 @@ try: except ImportError: # Pre-Python 3.3 email package: try to work around some bugs import re + from email.header import decode_header class EmailParser(Parser): def parsestr(self, text, headersonly=False): @@ -31,7 +32,15 @@ except ImportError: # (Finding subpart headers requires actually parsing the message.) headers, body = _split_headers_and_body(text) unfolded = "".join([_unfold_headers(headers), body]) - return Parser.parsestr(self, unfolded, headersonly=headersonly) + message = Parser.parsestr(self, unfolded, headersonly=headersonly) + + # Older Parser doesn't decode RFC2047 headers, so fix them up here. + # (Since message is fully parsed, can decode headers in all MIME subparts.) + for part in message.walk(): + part._headers = [ # doesn't seem to be a public API to easily replace all headers + (name, _decode_rfc2047(value)) + for name, value in part._headers] + return message # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers. 
# It tries to support defective messages missing a blank line between headers and body @@ -57,6 +66,25 @@ except ImportError: # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings) return _header_fold_re.sub("", text) + def _decode_rfc2047(value): + result = value + decoded_segments = decode_header(value) + if any(charset is not None for raw, charset in decoded_segments): + # At least one segment is an RFC2047 encoded-word. + # Reassemble the segments into a single decoded string. + unicode_segments = [] + prev_charset = None + for raw, charset in decoded_segments: + if (charset is None or prev_charset is None) and unicode_segments: + # Transitioning to, from, or between *non*-encoded segments: + # add back inter-segment whitespace that decode_header consumed + unicode_segments.append(u" ") + decoded = raw.decode(charset, 'replace') if charset is not None else raw + unicode_segments.append(decoded) + prev_charset = charset + result = u"".join(unicode_segments) + return result + class AnymailInboundMessage(Message, object): # `object` ensures new-style class in Python 2) """ diff --git a/tests/test_inbound.py b/tests/test_inbound.py index 334a342..353673c 100644 --- a/tests/test_inbound.py +++ b/tests/test_inbound.py @@ -413,3 +413,48 @@ class EmailParserWorkaroundTests(SimpleTestCase): self.assertEqual(msg.get_content_text(), "Not-A-Header: This is the body.\n It is not folded.\n") self.assertEqual(msg.defects, []) + + def test_parse_encoded_headers(self): + # RFC2047 header encoding + raw = dedent("""\ + Content-Type: text/plain + From: =?US-ASCII?Q?Keith_Moore?= + To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= , + =?ISO-8859-1?Q?Andr=E9?= "Pirard, Jr." 
+ Cc: =?utf-8?b?TmfGsOG7nWkgbmjhuq1u?= + Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= + =?utf-8?q?u_understand_the_example=E2=9C=93?= + X-Broken: =?utf-8?q?Not_a_char:_=88.?= + + Some examples adapted from http://dogmamix.com/MimeHeadersDecoder/ + """) + msg = AnymailInboundMessage.parse_raw_mime(raw) + + self.assertEqual(msg["From"], "Keith Moore ") + self.assertEqual(msg.from_email.display_name, "Keith Moore") + self.assertEqual(msg.from_email.addr_spec, "moore@example.com") + + # When an RFC2047 encoded-word abuts an RFC5322 quoted-word in a *structured* header, + # Python 3's parser nicely recombines them into a single quoted word. That's way too + # complicated for our Python 2 workaround ... + self.assertIn(msg["To"], [ # `To` header will decode to one of these: + 'Keld Jørn Simonsen , "André Pirard, Jr." ', # Python 3 + 'Keld Jørn Simonsen , André "Pirard, Jr." ', # workaround version + ]) + # ... but the two forms are equivalent, and de-structure the same: + self.assertEqual(msg.to[0].display_name, "Keld Jørn Simonsen") + self.assertEqual(msg.to[1].display_name, "André Pirard, Jr.") # correct in Python 3 *and* workaround! + + # Note: Like email.headerregistry.Address, Anymail decodes an RFC2047-encoded display_name, + # but does not decode a punycode domain. (Use `idna.decode(domain)` if you need that.) + self.assertEqual(msg["Cc"], "Người nhận ") + self.assertEqual(msg.cc[0].display_name, "Người nhận") + self.assertEqual(msg.cc[0].addr_spec, "cc@xn--th-e0a.example.com") + self.assertEqual(msg.cc[0].domain, "xn--th-e0a.example.com") + + # Subject breaks between 'o' and 'u' in the word "you", must be re-joined without space. + # Also tests joining encoded words with different charsets: + self.assertEqual(msg["Subject"], "If you can read this you understand the example\N{CHECK MARK}") + + # Replace illegal encodings (rather than causing error): + self.assertEqual(msg["X-Broken"], "Not a char: \N{REPLACEMENT CHARACTER}.")