Inbound: fix charset handling in .text, .html, .get_content_text()

Make `AnymailInboundMessage.text`, `.html` and `.get_content_text()` usually do the right thing for non-UTF-8 messages/attachments. Fixes an incorrect UnicodeDecodeError when receiving (e.g.) an ISO-8859-1 encoded message, and improves handling for inbound messages that were not properly encoded by the sender. * Decode using the message's (or attachment's) declared charset by default (rather than always defaulting to 'utf-8'); you can still override with `get_content_text(charset=...)`. * Add `errors` param to `get_content_text()`, defaulting to 'replace'. Mis-encoded messages will now use the Unicode replacement character rather than raising errors. (Use `get_content_text(errors='strict')` for the previous behavior.)
2026-02-05 12:05:21 -05:00 · 2018-04-01 14:18:35 -07:00
parent 97fc869992
commit 3928f6ea5e
3 changed files with 84 additions and 8 deletions
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -199,9 +199,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
        # should themselves be AnymailInboundMessage.
        for part in self.walk():
            if part.get_content_type() == content_type and not part.is_attachment():
-                payload = part.get_payload(decode=True)
+                return part.get_content_text()
                if payload is not None:
                    return payload.decode('utf-8')
        return None
    # Backport from Python 3.5 email.message.Message
@@ -238,7 +236,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
                             "(perhaps you want as_bytes()?)")
        return self.get_payload(decode=True)
-    def get_content_text(self, charset='utf-8'):
+    def get_content_text(self, charset=None, errors=None):
        """Return the payload decoded to text"""
        maintype = self.get_content_maintype()
        if maintype == 'message':
@@ -252,7 +250,13 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
            # and it's not clear which one is the "content".
            raise ValueError("get_content_text() is not valid on multipart messages "
                             "(perhaps you want as_string()?)")
-        return self.get_payload(decode=True).decode(charset)
+        else:
            payload = self.get_payload(decode=True)
            if payload is None:
                return payload
            charset = charset or self.get_content_charset('US-ASCII')
            errors = errors or 'replace'
            return payload.decode(charset, errors=errors)
    def as_uploaded_file(self):
        """Return the attachment converted to a Django UploadedFile"""
--- a/docs/inbound.rst
+++ b/docs/inbound.rst
@@ -363,11 +363,22 @@ have these methods:
        (Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition`
        method to all supported versions.)
-    .. method:: get_content_text(charset='utf-8')
+    .. method:: get_content_text(charset=None, errors='replace')
-        Returns the content of the attachment decoded to a `str` in the given charset.
+        Returns the content of the attachment decoded to Unicode text.
        (This is generally only appropriate for text or message-type attachments.)
        If provided, charset will override the attachment's declared charset. (This can be useful
        if you know the attachment's :mailheader:`Content-Type` has a missing or incorrect charset.)
        The errors param is as in :meth:`~bytes.decode`. The default "replace" substitutes the
        Unicode "replacement character" for any illegal characters in the text.
        .. versionchanged:: 2.1
            Changed to use attachment's declared charset by default,
            and added errors option defaulting to replace.
    .. method:: get_content_bytes()
        Returns the raw content of the attachment as bytes. (This will automatically decode
--- a/tests/test_inbound.py
+++ b/tests/test_inbound.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 import quopri
 from base64 import b64encode
 from email.utils import collapse_rfc2231_value
 from textwrap import dedent
@@ -132,7 +133,6 @@ class AnymailInboundMessageConstructionTests(SimpleTestCase):
    def test_construct_attachments_from_base64_data(self):
        # This is a fairly common way for ESPs to provide attachment content to webhooks
        from base64 import b64encode
        content = b64encode(SAMPLE_IMAGE_CONTENT)
        att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True)
        self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT)
@@ -208,6 +208,67 @@ class AnymailInboundMessageConveniencePropTests(SimpleTestCase):
        self.assertIsNone(msg.text)
        self.assertIsNone(msg.html)
    def test_body_props_charsets(self):
        text_8859_10 = "Detta är det vanliga innehållet".encode("ISO-8859-10")
        html_8859_8 = "<p>HTML זהו תוכן</p>".encode("ISO-8859-8")
        raw = dedent("""\
            MIME-Version: 1.0
            Subject: Charset test
            Content-Type: multipart/alternative; boundary="this_is_a_boundary"
            --this_is_a_boundary
            Content-Type: text/plain; charset=ISO-8859-10
            Content-Transfer-Encoding: QUOTED-PRINTABLE
            {text}
            --this_is_a_boundary
            Content-Type: text/html; charset=ISO-8859-8
            Content-Transfer-Encoding: QUOTED-PRINTABLE
            {html}
            --this_is_a_boundary--
            """).format(
                text=quopri.encodestring(text_8859_10).decode("ASCII"),
                html=quopri.encodestring(html_8859_8).decode("ASCII"),
            )
        msg = AnymailInboundMessage.parse_raw_mime(raw)
        self.assertEqual(msg.defects, [])
        self.assertEqual(msg.text, "Detta är det vanliga innehållet")
        self.assertEqual(msg.html, "<p>HTML זהו תוכן</p>")
        self.assertEqual(msg.get_payload(0).get_content_bytes(), text_8859_10)
        self.assertEqual(msg.get_payload(0).get_content_text(), "Detta är det vanliga innehållet")
        self.assertEqual(msg.get_payload(1).get_content_bytes(), html_8859_8)
        self.assertEqual(msg.get_payload(1).get_content_text(), "<p>HTML זהו תוכן</p>")
    def test_missing_or_invalid_charsets(self):
        """get_content_text has options for handling missing/invalid charset declarations"""
        raw = dedent("""\
            Subject: Oops, missing charset declaration
            Content-Type: text/plain
            Content-Transfer-Encoding: quoted-printable
            Algunos programas de correo electr=f3nico est=e1n rotos
            """)
        msg = AnymailInboundMessage.parse_raw_mime(raw)
        self.assertEqual(msg.defects, [])
        # default is charset from Content-Type (or 'US-ASCII' if missing), errors='replace'; .text uses defaults
        self.assertEqual(msg.get_content_text(),
                         "Algunos programas de correo electr\ufffdnico est\ufffdn rotos\n")
        self.assertEqual(msg.text, "Algunos programas de correo electr\ufffdnico est\ufffdn rotos\n")
        # can give specific charset if you know headers are wrong/missing
        self.assertEqual(msg.get_content_text(charset='ISO-8859-1'),
                         "Algunos programas de correo electrónico están rotos\n")
        # can change error handling
        with self.assertRaises(UnicodeDecodeError):
            msg.get_content_text(errors='strict')
        self.assertEqual(msg.get_content_text(errors='ignore'),
                         "Algunos programas de correo electrnico estn rotos\n")
    def test_date_props(self):
        msg = AnymailInboundMessage.construct(headers={
            'Date': "Mon, 23 Oct 2017 17:50:55 -0700"