From 3928f6ea5e3d6c36796b69f86794137b29032ad7 Mon Sep 17 00:00:00 2001 From: medmunds Date: Sun, 1 Apr 2018 14:18:35 -0700 Subject: [PATCH] Inbound: fix charset handling in .text, .html, .get_content_text() Make `AnymailInboundMessage.text`, `.html` and `.get_content_text()` usually do the right thing for non-UTF-8 messages/attachments. Fixes an incorrect UnicodeDecodeError when receiving an (e.g.,) ISO-8859-1 encoded message, and improves handling for inbound messages that were not properly encoded by the sender. * Decode using the message's (or attachment's) declared charset by default (rather than always defaulting to 'utf-8'; you can still override with `get_content_text(charset=...)`) * Add `errors` param to `get_content_text()`, defaulting to 'replace'. Mis-encoded messages will now use the Unicode replacement character rather than raising errors. (Use `get_content_text(errors='strict')` for the previous behavior.) --- anymail/inbound.py | 14 ++++++---- docs/inbound.rst | 15 +++++++++-- tests/test_inbound.py | 63 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 84 insertions(+), 8 deletions(-) diff --git a/anymail/inbound.py b/anymail/inbound.py index 11b3f44..40a7cc0 100644 --- a/anymail/inbound.py +++ b/anymail/inbound.py @@ -199,9 +199,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas # should themselves be AnymailInboundMessage. 
for part in self.walk(): if part.get_content_type() == content_type and not part.is_attachment(): - payload = part.get_payload(decode=True) - if payload is not None: - return payload.decode('utf-8') + return part.get_content_text() return None # Backport from Python 3.5 email.message.Message @@ -238,7 +236,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas "(perhaps you want as_bytes()?)") return self.get_payload(decode=True) - def get_content_text(self, charset='utf-8'): + def get_content_text(self, charset=None, errors=None): """Return the payload decoded to text""" maintype = self.get_content_maintype() if maintype == 'message': @@ -252,7 +250,13 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas # and it's not clear which one is the "content". raise ValueError("get_content_text() is not valid on multipart messages " "(perhaps you want as_string()?)") - return self.get_payload(decode=True).decode(charset) + else: + payload = self.get_payload(decode=True) + if payload is None: + return payload + charset = charset or self.get_content_charset('US-ASCII') + errors = errors or 'replace' + return payload.decode(charset, errors=errors) def as_uploaded_file(self): """Return the attachment converted to a Django UploadedFile""" diff --git a/docs/inbound.rst b/docs/inbound.rst index 189b569..4e4a964 100644 --- a/docs/inbound.rst +++ b/docs/inbound.rst @@ -363,11 +363,22 @@ have these methods: (Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition` method to all supported versions.) - .. method:: get_content_text(charset='utf-8') + .. method:: get_content_text(charset=None, errors='replace') - Returns the content of the attachment decoded to a `str` in the given charset. + Returns the content of the attachment decoded to Unicode text. (This is generally only appropriate for text or message-type attachments.) 
+ If provided, charset will override the attachment's declared charset. (This can be useful + if you know the attachment's :mailheader:`Content-Type` has a missing or incorrect charset.) + + The errors param is as in :meth:`~bytes.decode`. The default "replace" substitutes the + Unicode "replacement character" for any illegal characters in the text. + + .. versionchanged:: 2.1 + + Changed to use attachment's declared charset by default, + and added errors option defaulting to replace. + .. method:: get_content_bytes() Returns the raw content of the attachment as bytes. (This will automatically decode diff --git a/tests/test_inbound.py b/tests/test_inbound.py index aff8215..718c956 100644 --- a/tests/test_inbound.py +++ b/tests/test_inbound.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import quopri from base64 import b64encode from email.utils import collapse_rfc2231_value from textwrap import dedent @@ -132,7 +133,6 @@ class AnymailInboundMessageConstructionTests(SimpleTestCase): def test_construct_attachments_from_base64_data(self): # This is a fairly common way for ESPs to provide attachment content to webhooks - from base64 import b64encode content = b64encode(SAMPLE_IMAGE_CONTENT) att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True) self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT) @@ -208,6 +208,67 @@ class AnymailInboundMessageConveniencePropTests(SimpleTestCase): self.assertIsNone(msg.text) self.assertIsNone(msg.html) + def test_body_props_charsets(self): + text_8859_10 = "Detta är det vanliga innehållet".encode("ISO-8859-10") + html_8859_8 = "

HTML זהו תוכן

".encode("ISO-8859-8") + raw = dedent("""\ + MIME-Version: 1.0 + Subject: Charset test + Content-Type: multipart/alternative; boundary="this_is_a_boundary" + + --this_is_a_boundary + Content-Type: text/plain; charset=ISO-8859-10 + Content-Transfer-Encoding: QUOTED-PRINTABLE + + {text} + --this_is_a_boundary + Content-Type: text/html; charset=ISO-8859-8 + Content-Transfer-Encoding: QUOTED-PRINTABLE + + {html} + --this_is_a_boundary-- + """).format( + text=quopri.encodestring(text_8859_10).decode("ASCII"), + html=quopri.encodestring(html_8859_8).decode("ASCII"), + ) + + msg = AnymailInboundMessage.parse_raw_mime(raw) + self.assertEqual(msg.defects, []) + self.assertEqual(msg.text, "Detta är det vanliga innehållet") + self.assertEqual(msg.html, "

HTML זהו תוכן

") + + self.assertEqual(msg.get_payload(0).get_content_bytes(), text_8859_10) + self.assertEqual(msg.get_payload(0).get_content_text(), "Detta är det vanliga innehållet") + self.assertEqual(msg.get_payload(1).get_content_bytes(), html_8859_8) + self.assertEqual(msg.get_payload(1).get_content_text(), "

HTML זהו תוכן

") + + def test_missing_or_invalid_charsets(self): + """get_content_text has options for handling missing/invalid charset declarations""" + raw = dedent("""\ + Subject: Oops, missing charset declaration + Content-Type: text/plain + Content-Transfer-Encoding: quoted-printable + + Algunos programas de correo electr=f3nico est=e1n rotos + """) + msg = AnymailInboundMessage.parse_raw_mime(raw) + self.assertEqual(msg.defects, []) + + # default is charset from Content-Type (or 'US-ASCII' if missing), errors='replace'; .text uses defaults + self.assertEqual(msg.get_content_text(), + "Algunos programas de correo electr�nico est�n rotos\n") + self.assertEqual(msg.text, "Algunos programas de correo electr�nico est�n rotos\n") + + # can give specific charset if you know headers are wrong/missing + self.assertEqual(msg.get_content_text(charset='ISO-8859-1'), + "Algunos programas de correo electrónico están rotos\n") + + # can change error handling + with self.assertRaises(UnicodeDecodeError): + msg.get_content_text(errors='strict') + self.assertEqual(msg.get_content_text(errors='ignore'), + "Algunos programas de correo electrnico estn rotos\n") + def test_date_props(self): msg = AnymailInboundMessage.construct(headers={ 'Date': "Mon, 23 Oct 2017 17:50:55 -0700"