From 3928f6ea5e3d6c36796b69f86794137b29032ad7 Mon Sep 17 00:00:00 2001
From: medmunds <medmunds@gmail.com>
Date: Sun, 1 Apr 2018 14:18:35 -0700
Subject: [PATCH] Inbound: fix charset handling in .text, .html,
 .get_content_text()

Make `AnymailInboundMessage.text`, `.html` and `.get_content_text()`
usually do the right thing for non-UTF-8 messages/attachments. Fixes
an incorrect UnicodeDecodeError when receiving (e.g.) an ISO-8859-1
encoded message, and improves handling for inbound messages that were
not properly encoded by the sender.

* Decode using the message's (or attachment's) declared charset
  by default (rather than always defaulting to 'utf-8'; you can
  still override with `get_content_text(charset=...)`)
* Add `errors` param to `get_content_text()`, defaulting to 'replace'.
  Mis-encoded messages will now use the Unicode replacement character
  rather than raising errors. (Use `get_content_text(errors='strict')`
  for the previous behavior.)
---
 anymail/inbound.py    | 14 ++++++----
 docs/inbound.rst      | 15 +++++++++--
 tests/test_inbound.py | 63 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/anymail/inbound.py b/anymail/inbound.py
index 11b3f44..40a7cc0 100644
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -199,9 +199,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
         # should themselves be AnymailInboundMessage.
         for part in self.walk():
             if part.get_content_type() == content_type and not part.is_attachment():
-                payload = part.get_payload(decode=True)
-                if payload is not None:
-                    return payload.decode('utf-8')
+                return part.get_content_text()
         return None
 
     # Backport from Python 3.5 email.message.Message
@@ -238,7 +236,7 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
                              "(perhaps you want as_bytes()?)")
         return self.get_payload(decode=True)
 
-    def get_content_text(self, charset='utf-8'):
+    def get_content_text(self, charset=None, errors=None):
         """Return the payload decoded to text"""
         maintype = self.get_content_maintype()
         if maintype == 'message':
@@ -252,7 +250,13 @@ class AnymailInboundMessage(Message, object):  # `object` ensures new-style clas
             # and it's not clear which one is the "content".
             raise ValueError("get_content_text() is not valid on multipart messages "
                              "(perhaps you want as_string()?)")
-        return self.get_payload(decode=True).decode(charset)
+        else:
+            payload = self.get_payload(decode=True)
+            if payload is None:
+                return payload
+            charset = charset or self.get_content_charset('US-ASCII')
+            errors = errors or 'replace'
+            return payload.decode(charset, errors=errors)
 
     def as_uploaded_file(self):
         """Return the attachment converted to a Django UploadedFile"""
diff --git a/docs/inbound.rst b/docs/inbound.rst
index 189b569..4e4a964 100644
--- a/docs/inbound.rst
+++ b/docs/inbound.rst
@@ -363,11 +363,22 @@ have these methods:
         (Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition`
         method to all supported versions.)
 
-    .. method:: get_content_text(charset='utf-8')
+    .. method:: get_content_text(charset=None, errors='replace')
 
-        Returns the content of the attachment decoded to a `str` in the given charset.
+        Returns the content of the attachment decoded to Unicode text.
         (This is generally only appropriate for text or message-type attachments.)
 
+        If provided, charset will override the attachment's declared charset. (This can be useful
+        if you know the attachment's :mailheader:`Content-Type` has a missing or incorrect charset.)
+
+        The errors param is as in :meth:`~bytes.decode`. The default "replace" substitutes the
+        Unicode "replacement character" for any illegal characters in the text.
+
+        .. versionchanged:: 2.1
+
+            Changed to use attachment's declared charset by default,
+            and added errors option defaulting to replace.
+
     .. method:: get_content_bytes()
 
         Returns the raw content of the attachment as bytes. (This will automatically decode
diff --git a/tests/test_inbound.py b/tests/test_inbound.py
index aff8215..718c956 100644
--- a/tests/test_inbound.py
+++ b/tests/test_inbound.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
+import quopri
 from base64 import b64encode
 from email.utils import collapse_rfc2231_value
 from textwrap import dedent
@@ -132,7 +133,6 @@ class AnymailInboundMessageConstructionTests(SimpleTestCase):
 
     def test_construct_attachments_from_base64_data(self):
         # This is a fairly common way for ESPs to provide attachment content to webhooks
-        from base64 import b64encode
         content = b64encode(SAMPLE_IMAGE_CONTENT)
         att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True)
         self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT)
@@ -208,6 +208,67 @@ class AnymailInboundMessageConveniencePropTests(SimpleTestCase):
         self.assertIsNone(msg.text)
         self.assertIsNone(msg.html)
 
+    def test_body_props_charsets(self):
+        text_8859_10 = "Detta är det vanliga innehållet".encode("ISO-8859-10")
+        html_8859_8 = "<p>HTML זהו תוכן</p>".encode("ISO-8859-8")
+        raw = dedent("""\
+            MIME-Version: 1.0
+            Subject: Charset test
+            Content-Type: multipart/alternative; boundary="this_is_a_boundary"
+
+            --this_is_a_boundary
+            Content-Type: text/plain; charset=ISO-8859-10
+            Content-Transfer-Encoding: QUOTED-PRINTABLE
+
+            {text}
+            --this_is_a_boundary
+            Content-Type: text/html; charset=ISO-8859-8
+            Content-Transfer-Encoding: QUOTED-PRINTABLE
+
+            {html}
+            --this_is_a_boundary--
+            """).format(
+                text=quopri.encodestring(text_8859_10).decode("ASCII"),
+                html=quopri.encodestring(html_8859_8).decode("ASCII"),
+            )
+
+        msg = AnymailInboundMessage.parse_raw_mime(raw)
+        self.assertEqual(msg.defects, [])
+        self.assertEqual(msg.text, "Detta är det vanliga innehållet")
+        self.assertEqual(msg.html, "<p>HTML זהו תוכן</p>")
+
+        self.assertEqual(msg.get_payload(0).get_content_bytes(), text_8859_10)
+        self.assertEqual(msg.get_payload(0).get_content_text(), "Detta är det vanliga innehållet")
+        self.assertEqual(msg.get_payload(1).get_content_bytes(), html_8859_8)
+        self.assertEqual(msg.get_payload(1).get_content_text(), "<p>HTML זהו תוכן</p>")
+
+    def test_missing_or_invalid_charsets(self):
+        """get_content_text has options for handling missing/invalid charset declarations"""
+        raw = dedent("""\
+            Subject: Oops, missing charset declaration
+            Content-Type: text/plain
+            Content-Transfer-Encoding: quoted-printable
+
+            Algunos programas de correo electr=f3nico est=e1n rotos
+            """)
+        msg = AnymailInboundMessage.parse_raw_mime(raw)
+        self.assertEqual(msg.defects, [])
+
+        # default is charset from Content-Type (or 'US-ASCII' if missing), errors='replace'; .text uses defaults
+        self.assertEqual(msg.get_content_text(),
+                         "Algunos programas de correo electr�nico est�n rotos\n")
+        self.assertEqual(msg.text, "Algunos programas de correo electr�nico est�n rotos\n")
+
+        # can give specific charset if you know headers are wrong/missing
+        self.assertEqual(msg.get_content_text(charset='ISO-8859-1'),
+                         "Algunos programas de correo electrónico están rotos\n")
+
+        # can change error handling
+        with self.assertRaises(UnicodeDecodeError):
+            msg.get_content_text(errors='strict')
+        self.assertEqual(msg.get_content_text(errors='ignore'),
+                         "Algunos programas de correo electrnico estn rotos\n")
+
     def test_date_props(self):
         msg = AnymailInboundMessage.construct(headers={
             'Date': "Mon, 23 Oct 2017 17:50:55 -0700"