Inbound: fix charset handling in .text, .html, .get_content_text()

Make `AnymailInboundMessage.text`, `.html` and `.get_content_text()`
usually do the right thing for non-UTF-8 messages/attachments. Fixes
an incorrect UnicodeDecodeError when receiving an (e.g.,) ISO-8859-1
encoded message, and improves handling for inbound messages that were
not properly encoded by the sender.

* Decode using the message's (or attachment's) declared charset
  by default (rather than always defaulting to 'utf-8'; you can
  still override with `get_content_text(charset=...)`)
* Add `errors` param to `get_content_text()`, defaulting to 'replace'.
  Mis-encoded messages will now use the Unicode replacement character
  rather than raising errors. (Use `get_content_text(errors='strict')`
  for the previous behavior.)
This commit is contained in:
medmunds
2018-04-01 14:18:35 -07:00
parent 97fc869992
commit 3928f6ea5e
3 changed files with 84 additions and 8 deletions

View File

@@ -199,9 +199,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
# should themselves be AnymailInboundMessage. # should themselves be AnymailInboundMessage.
for part in self.walk(): for part in self.walk():
if part.get_content_type() == content_type and not part.is_attachment(): if part.get_content_type() == content_type and not part.is_attachment():
payload = part.get_payload(decode=True) return part.get_content_text()
if payload is not None:
return payload.decode('utf-8')
return None return None
# Backport from Python 3.5 email.message.Message # Backport from Python 3.5 email.message.Message
@@ -238,7 +236,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
"(perhaps you want as_bytes()?)") "(perhaps you want as_bytes()?)")
return self.get_payload(decode=True) return self.get_payload(decode=True)
def get_content_text(self, charset='utf-8'): def get_content_text(self, charset=None, errors=None):
"""Return the payload decoded to text""" """Return the payload decoded to text"""
maintype = self.get_content_maintype() maintype = self.get_content_maintype()
if maintype == 'message': if maintype == 'message':
@@ -252,7 +250,13 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
# and it's not clear which one is the "content". # and it's not clear which one is the "content".
raise ValueError("get_content_text() is not valid on multipart messages " raise ValueError("get_content_text() is not valid on multipart messages "
"(perhaps you want as_string()?)") "(perhaps you want as_string()?)")
return self.get_payload(decode=True).decode(charset) else:
payload = self.get_payload(decode=True)
if payload is None:
return payload
charset = charset or self.get_content_charset('US-ASCII')
errors = errors or 'replace'
return payload.decode(charset, errors=errors)
def as_uploaded_file(self): def as_uploaded_file(self):
"""Return the attachment converted to a Django UploadedFile""" """Return the attachment converted to a Django UploadedFile"""

View File

@@ -363,11 +363,22 @@ have these methods:
(Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition` (Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition`
method to all supported versions.) method to all supported versions.)
.. method:: get_content_text(charset='utf-8') .. method:: get_content_text(charset=None, errors='replace')
Returns the content of the attachment decoded to a `str` in the given charset. Returns the content of the attachment decoded to Unicode text.
(This is generally only appropriate for text or message-type attachments.) (This is generally only appropriate for text or message-type attachments.)
If provided, charset will override the attachment's declared charset. (This can be useful
if you know the attachment's :mailheader:`Content-Type` has a missing or incorrect charset.)
The errors param is as in :meth:`~bytes.decode`. The default "replace" substitutes the
Unicode "replacement character" for any illegal characters in the text.
.. versionchanged:: 2.1
Changed to use attachment's declared charset by default,
and added errors option defaulting to replace.
.. method:: get_content_bytes() .. method:: get_content_bytes()
Returns the raw content of the attachment as bytes. (This will automatically decode Returns the raw content of the attachment as bytes. (This will automatically decode

View File

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
import quopri
from base64 import b64encode from base64 import b64encode
from email.utils import collapse_rfc2231_value from email.utils import collapse_rfc2231_value
from textwrap import dedent from textwrap import dedent
@@ -132,7 +133,6 @@ class AnymailInboundMessageConstructionTests(SimpleTestCase):
def test_construct_attachments_from_base64_data(self): def test_construct_attachments_from_base64_data(self):
# This is a fairly common way for ESPs to provide attachment content to webhooks # This is a fairly common way for ESPs to provide attachment content to webhooks
from base64 import b64encode
content = b64encode(SAMPLE_IMAGE_CONTENT) content = b64encode(SAMPLE_IMAGE_CONTENT)
att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True) att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True)
self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT) self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT)
@@ -208,6 +208,67 @@ class AnymailInboundMessageConveniencePropTests(SimpleTestCase):
self.assertIsNone(msg.text) self.assertIsNone(msg.text)
self.assertIsNone(msg.html) self.assertIsNone(msg.html)
def test_body_props_charsets(self):
text_8859_10 = "Detta är det vanliga innehållet".encode("ISO-8859-10")
html_8859_8 = "<p>HTML זהו תוכן</p>".encode("ISO-8859-8")
raw = dedent("""\
MIME-Version: 1.0
Subject: Charset test
Content-Type: multipart/alternative; boundary="this_is_a_boundary"
--this_is_a_boundary
Content-Type: text/plain; charset=ISO-8859-10
Content-Transfer-Encoding: QUOTED-PRINTABLE
{text}
--this_is_a_boundary
Content-Type: text/html; charset=ISO-8859-8
Content-Transfer-Encoding: QUOTED-PRINTABLE
{html}
--this_is_a_boundary--
""").format(
text=quopri.encodestring(text_8859_10).decode("ASCII"),
html=quopri.encodestring(html_8859_8).decode("ASCII"),
)
msg = AnymailInboundMessage.parse_raw_mime(raw)
self.assertEqual(msg.defects, [])
self.assertEqual(msg.text, "Detta är det vanliga innehållet")
self.assertEqual(msg.html, "<p>HTML זהו תוכן</p>")
self.assertEqual(msg.get_payload(0).get_content_bytes(), text_8859_10)
self.assertEqual(msg.get_payload(0).get_content_text(), "Detta är det vanliga innehållet")
self.assertEqual(msg.get_payload(1).get_content_bytes(), html_8859_8)
self.assertEqual(msg.get_payload(1).get_content_text(), "<p>HTML זהו תוכן</p>")
def test_missing_or_invalid_charsets(self):
"""get_content_text has options for handling missing/invalid charset declarations"""
raw = dedent("""\
Subject: Oops, missing charset declaration
Content-Type: text/plain
Content-Transfer-Encoding: quoted-printable
Algunos programas de correo electr=f3nico est=e1n rotos
""")
msg = AnymailInboundMessage.parse_raw_mime(raw)
self.assertEqual(msg.defects, [])
# default is charset from Content-Type (or 'US-ASCII' if missing), errors='replace'; .text uses defaults
self.assertEqual(msg.get_content_text(),
"Algunos programas de correo electr<74>nico est<73>n rotos\n")
self.assertEqual(msg.text, "Algunos programas de correo electr<74>nico est<73>n rotos\n")
# can give specific charset if you know headers are wrong/missing
self.assertEqual(msg.get_content_text(charset='ISO-8859-1'),
"Algunos programas de correo electrónico están rotos\n")
# can change error handling
with self.assertRaises(UnicodeDecodeError):
msg.get_content_text(errors='strict')
self.assertEqual(msg.get_content_text(errors='ignore'),
"Algunos programas de correo electrnico estn rotos\n")
def test_date_props(self): def test_date_props(self):
msg = AnymailInboundMessage.construct(headers={ msg = AnymailInboundMessage.construct(headers={
'Date': "Mon, 23 Oct 2017 17:50:55 -0700" 'Date': "Mon, 23 Oct 2017 17:50:55 -0700"