Inbound: decode Unicode and other non-ASCII email headers on Python 2

In AnymailInboundMessage, work around Python 2 email.parser.Parser's lack of handling for RFC2047-encoded email headers. (The Python 3 email package already decodes these automatically.) Improves inbound handling on Python 2 for all ESPs that provide raw MIME email or raw headers with inbound events. (Mailgun, Mandrill, SendGrid, SparkPost.)
2026-02-05 20:15:24 -05:00 · 2018-03-24 10:03:18 -07:00
parent 70094cf3bc
commit 3d27e3fe6b
2 changed files with 74 additions and 1 deletions
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -22,6 +22,7 @@ try:
 except ImportError:
    # Pre-Python 3.3 email package: try to work around some bugs
    import re
+    from email.header import decode_header

    class EmailParser(Parser):
        def parsestr(self, text, headersonly=False):
@@ -31,7 +32,15 @@ except ImportError:
            # (Finding subpart headers requires actually parsing the message.)
            headers, body = _split_headers_and_body(text)
            unfolded = "".join([_unfold_headers(headers), body])
-            return Parser.parsestr(self, unfolded, headersonly=headersonly)
+            message = Parser.parsestr(self, unfolded, headersonly=headersonly)
+
+            # Older Parser doesn't decode RFC2047 headers, so fix them up here.
+            # (Since message is fully parsed, can decode headers in all MIME subparts.)
+            for part in message.walk():
+                part._headers = [  # doesn't seem to be a public API to easily replace all headers
+                    (name, _decode_rfc2047(value))
+                    for name, value in part._headers]
+            return message

    # Note: email.feedparser.headerRE is a more-complicated RE for recognizing headers.
    # It tries to support defective messages missing a blank line between headers and body
@@ -57,6 +66,25 @@ except ImportError:
        # (WSP is space or tab, and per email.parser semantics, this allows CRLF, CR, or LF endings)
        return _header_fold_re.sub("", text)

+    def _decode_rfc2047(value):  # Return value with RFC2047 encoded-words decoded; unchanged if it has none
+        result = value
+        decoded_segments = decode_header(value)  # sequence of (raw, charset) pairs
+        if any(charset is not None for raw, charset in decoded_segments):
+            # At least one segment is an RFC2047 encoded-word (it has a charset).
+            # Reassemble all of the segments into a single decoded string.
+            unicode_segments = []
+            prev_charset = None  # charset of the previous segment; None before the first one
+            for raw, charset in decoded_segments:
+                if (charset is None or prev_charset is None) and unicode_segments:
+                    # Transitioning to, from, or between *non*-encoded segments:
+                    # add back inter-segment whitespace that decode_header consumed
+                    unicode_segments.append(u" ")
+                decoded = raw.decode(charset, 'replace') if charset is not None else raw  # NOTE(review): 'replace' covers undecodable bytes only; an unknown charset name presumably still raises LookupError -- confirm
+                unicode_segments.append(decoded)
+                prev_charset = charset
+            result = u"".join(unicode_segments)
+        return result
+

 class AnymailInboundMessage(Message, object):  # `object` ensures new-style class in Python 2)
    """