From 3d27e3fe6b81c40659c8143a142918531098766f Mon Sep 17 00:00:00 2001
From: medmunds <medmunds@gmail.com>
Date: Sat, 24 Mar 2018 10:03:18 -0700
Subject: [PATCH] Inbound: decode Unicode and other non-ASCII email headers on
 Python 2

In AnymailInboundMessage, work around Python 2 email.parser.Parser's
lack of handling for RFC2047-encoded email headers. (The Python 3 email
package already decodes these automatically.)

Improves inbound handling on Python 2 for all ESPs that provide raw
MIME email or raw headers with inbound events. (Mailgun, Mandrill,
SendGrid, SparkPost.)
---
 anymail/inbound.py    | 30 ++++++++++++++++++++++++++++-
 tests/test_inbound.py | 45 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/anymail/inbound.py b/anymail/inbound.py
index 770aa77..11b3f44 100644
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -22,6 +22,7 @@ try:
 except ImportError:
     # Pre-Python 3.3 email package: try to work around some bugs
     import re
+    from email.header import decode_header
 
     class EmailParser(Parser):
         def parsestr(self, text, headersonly=False):
@@ -31,7 +32,15 @@ except ImportError:
             # (Finding subpart headers requires actually parsing the message.)
             headers, body = _split_headers_and_body(text)
             unfolded = "".join([_unfold_headers(headers), body])
-            return Parser.parsestr(self, unfolded, headersonly=headersonly)
+            message = Parser.parsestr(self, unfolded, headersonly=headersonly)
+
+            # Older Parser doesn't decode RFC2047 headers, so fix them up here.
+            # (Since message is fully parsed, can decode headers in all MIME subparts.)
+            for part in message.walk():
+                part._headers = [  # doesn't seem to be a public API to easily replace all headers
+                    (name, _decode_rfc2047(value))
+                    for name, value in part._headers]
+            return message
 
     # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers.
     # It tries to support defective messages missing a blank line between headers and body
@@ -57,6 +66,25 @@ except ImportError:
         # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings)
         return _header_fold_re.sub("", text)
 
+    def _decode_rfc2047(value):
+        result = value
+        decoded_segments = decode_header(value)
+        if any(charset is not None for raw, charset in decoded_segments):
+            # At least one segment is an RFC2047 encoded-word.
+            # Reassemble the segments into a single decoded string.
+            unicode_segments = []
+            prev_charset = None
+            for raw, charset in decoded_segments:
+                if (charset is None or prev_charset is None) and unicode_segments:
+                    # Transitioning to, from, or between *non*-encoded segments:
+                    # add back inter-segment whitespace that decode_header consumed
+                    unicode_segments.append(u" ")
+                decoded = raw.decode(charset, 'replace') if charset is not None else raw
+                unicode_segments.append(decoded)
+                prev_charset = charset
+            result = u"".join(unicode_segments)
+        return result
+
 
 class AnymailInboundMessage(Message, object):  # `object` ensures new-style class in Python 2)
     """
diff --git a/tests/test_inbound.py b/tests/test_inbound.py
index 334a342..353673c 100644
--- a/tests/test_inbound.py
+++ b/tests/test_inbound.py
@@ -413,3 +413,48 @@ class EmailParserWorkaroundTests(SimpleTestCase):
         self.assertEqual(msg.get_content_text(),
                          "Not-A-Header: This is the body.\n It is not folded.\n")
         self.assertEqual(msg.defects, [])
+
+    def test_parse_encoded_headers(self):
+        # RFC2047 header encoding
+        raw = dedent("""\
+            Content-Type: text/plain
+            From: =?US-ASCII?Q?Keith_Moore?= <moore@example.com>
+            To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@example.com>,
+             =?ISO-8859-1?Q?Andr=E9?= "Pirard, Jr." <PIRARD@example.com>
+            Cc: =?utf-8?b?TmfGsOG7nWkgbmjhuq1u?= <cc@xn--th-e0a.example.com>
+            Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
+             =?utf-8?q?u_understand_the_example=E2=9C=93?=
+            X-Broken: =?utf-8?q?Not_a_char:_=88.?=
+
+            Some examples adapted from http://dogmamix.com/MimeHeadersDecoder/
+            """)
+        msg = AnymailInboundMessage.parse_raw_mime(raw)
+
+        self.assertEqual(msg["From"], "Keith Moore <moore@example.com>")
+        self.assertEqual(msg.from_email.display_name, "Keith Moore")
+        self.assertEqual(msg.from_email.addr_spec, "moore@example.com")
+
+        # When an RFC2047 encoded-word abuts an RFC5322 quoted-string in a *structured* header,
+        # Python 3's parser nicely recombines them into a single quoted word. That's way too
+        # complicated for our Python 2 workaround ...
+        self.assertIn(msg["To"], [  # `To` header will decode to one of these:
+            'Keld Jørn Simonsen <keld@example.com>, "André Pirard, Jr." <PIRARD@example.com>',  # Python 3
+            'Keld Jørn Simonsen <keld@example.com>, André "Pirard, Jr." <PIRARD@example.com>',  # workaround version
+        ])
+        # ... but the two forms are equivalent, and de-structure the same:
+        self.assertEqual(msg.to[0].display_name, "Keld Jørn Simonsen")
+        self.assertEqual(msg.to[1].display_name, "André Pirard, Jr.")  # correct in Python 3 *and* workaround!
+
+        # Note: Like email.headerregistry.Address, Anymail decodes an RFC2047-encoded display_name,
+        # but does not decode a punycode domain. (Use `idna.decode(domain)` if you need that.)
+        self.assertEqual(msg["Cc"], "Người nhận <cc@xn--th-e0a.example.com>")
+        self.assertEqual(msg.cc[0].display_name, "Người nhận")
+        self.assertEqual(msg.cc[0].addr_spec, "cc@xn--th-e0a.example.com")
+        self.assertEqual(msg.cc[0].domain, "xn--th-e0a.example.com")
+
+        # Subject breaks between 'o' and 'u' in the word "you", must be re-joined without space.
+        # Also tests joining encoded words with different charsets:
+        self.assertEqual(msg["Subject"], "If you can read this you understand the example\N{CHECK MARK}")
+
+        # Replace illegal encodings (rather than causing error):
+        self.assertEqual(msg["X-Broken"], "Not a char: \N{REPLACEMENT CHARACTER}.")