mirror of
https://github.com/pacnpal/django-anymail.git
synced 2025-12-20 03:41:05 -05:00
Inbound: fix charset handling in .text, .html, .get_content_text()
Make `AnymailInboundMessage.text`, `.html` and `.get_content_text()` usually do the right thing for non-UTF-8 messages/attachments. Fixes an incorrect UnicodeDecodeError when receiving a message encoded in (e.g.) ISO-8859-1, and improves handling for inbound messages that were not properly encoded by the sender. * Decode using the message's (or attachment's) declared charset by default (rather than always defaulting to 'utf-8'; you can still override with `get_content_text(charset=...)`). * Add `errors` param to `get_content_text()`, defaulting to 'replace'. Mis-encoded messages will now use the Unicode replacement character rather than raising errors. (Use `get_content_text(errors='strict')` for the previous behavior.)
This commit is contained in:
@@ -199,9 +199,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
|
|||||||
# should themselves be AnymailInboundMessage.
|
# should themselves be AnymailInboundMessage.
|
||||||
for part in self.walk():
|
for part in self.walk():
|
||||||
if part.get_content_type() == content_type and not part.is_attachment():
|
if part.get_content_type() == content_type and not part.is_attachment():
|
||||||
payload = part.get_payload(decode=True)
|
return part.get_content_text()
|
||||||
if payload is not None:
|
|
||||||
return payload.decode('utf-8')
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Backport from Python 3.5 email.message.Message
|
# Backport from Python 3.5 email.message.Message
|
||||||
@@ -238,7 +236,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
|
|||||||
"(perhaps you want as_bytes()?)")
|
"(perhaps you want as_bytes()?)")
|
||||||
return self.get_payload(decode=True)
|
return self.get_payload(decode=True)
|
||||||
|
|
||||||
def get_content_text(self, charset='utf-8'):
|
def get_content_text(self, charset=None, errors=None):
|
||||||
"""Return the payload decoded to text"""
|
"""Return the payload decoded to text"""
|
||||||
maintype = self.get_content_maintype()
|
maintype = self.get_content_maintype()
|
||||||
if maintype == 'message':
|
if maintype == 'message':
|
||||||
@@ -252,7 +250,13 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
|
|||||||
# and it's not clear which one is the "content".
|
# and it's not clear which one is the "content".
|
||||||
raise ValueError("get_content_text() is not valid on multipart messages "
|
raise ValueError("get_content_text() is not valid on multipart messages "
|
||||||
"(perhaps you want as_string()?)")
|
"(perhaps you want as_string()?)")
|
||||||
return self.get_payload(decode=True).decode(charset)
|
else:
|
||||||
|
payload = self.get_payload(decode=True)
|
||||||
|
if payload is None:
|
||||||
|
return payload
|
||||||
|
charset = charset or self.get_content_charset('US-ASCII')
|
||||||
|
errors = errors or 'replace'
|
||||||
|
return payload.decode(charset, errors=errors)
|
||||||
|
|
||||||
def as_uploaded_file(self):
|
def as_uploaded_file(self):
|
||||||
"""Return the attachment converted to a Django UploadedFile"""
|
"""Return the attachment converted to a Django UploadedFile"""
|
||||||
|
|||||||
@@ -363,11 +363,22 @@ have these methods:
|
|||||||
(Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition`
|
(Anymail back-ports Python 3.5's :meth:`~email.message.Message.get_content_disposition`
|
||||||
method to all supported versions.)
|
method to all supported versions.)
|
||||||
|
|
||||||
.. method:: get_content_text(charset='utf-8')
|
.. method:: get_content_text(charset=None, errors='replace')
|
||||||
|
|
||||||
Returns the content of the attachment decoded to a `str` in the given charset.
|
Returns the content of the attachment decoded to Unicode text.
|
||||||
(This is generally only appropriate for text or message-type attachments.)
|
(This is generally only appropriate for text or message-type attachments.)
|
||||||
|
|
||||||
|
If provided, charset will override the attachment's declared charset. (This can be useful
|
||||||
|
if you know the attachment's :mailheader:`Content-Type` has a missing or incorrect charset.)
|
||||||
|
|
||||||
|
The errors param is as in :meth:`~bytes.decode`. The default "replace" substitutes the
|
||||||
|
Unicode "replacement character" for any illegal characters in the text.
|
||||||
|
|
||||||
|
.. versionchanged:: 2.1
|
||||||
|
|
||||||
|
Changed to use attachment's declared charset by default,
|
||||||
|
and added errors option defaulting to replace.
|
||||||
|
|
||||||
.. method:: get_content_bytes()
|
.. method:: get_content_bytes()
|
||||||
|
|
||||||
Returns the raw content of the attachment as bytes. (This will automatically decode
|
Returns the raw content of the attachment as bytes. (This will automatically decode
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import quopri
|
||||||
from base64 import b64encode
|
from base64 import b64encode
|
||||||
from email.utils import collapse_rfc2231_value
|
from email.utils import collapse_rfc2231_value
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
@@ -132,7 +133,6 @@ class AnymailInboundMessageConstructionTests(SimpleTestCase):
|
|||||||
|
|
||||||
def test_construct_attachments_from_base64_data(self):
|
def test_construct_attachments_from_base64_data(self):
|
||||||
# This is a fairly common way for ESPs to provide attachment content to webhooks
|
# This is a fairly common way for ESPs to provide attachment content to webhooks
|
||||||
from base64 import b64encode
|
|
||||||
content = b64encode(SAMPLE_IMAGE_CONTENT)
|
content = b64encode(SAMPLE_IMAGE_CONTENT)
|
||||||
att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True)
|
att = AnymailInboundMessage.construct_attachment(content_type="image/png", content=content, base64=True)
|
||||||
self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT)
|
self.assertEqual(att.get_content_bytes(), SAMPLE_IMAGE_CONTENT)
|
||||||
@@ -208,6 +208,67 @@ class AnymailInboundMessageConveniencePropTests(SimpleTestCase):
|
|||||||
self.assertIsNone(msg.text)
|
self.assertIsNone(msg.text)
|
||||||
self.assertIsNone(msg.html)
|
self.assertIsNone(msg.html)
|
||||||
|
|
||||||
|
def test_body_props_charsets(self):
|
||||||
|
text_8859_10 = "Detta är det vanliga innehållet".encode("ISO-8859-10")
|
||||||
|
html_8859_8 = "<p>HTML זהו תוכן</p>".encode("ISO-8859-8")
|
||||||
|
raw = dedent("""\
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Subject: Charset test
|
||||||
|
Content-Type: multipart/alternative; boundary="this_is_a_boundary"
|
||||||
|
|
||||||
|
--this_is_a_boundary
|
||||||
|
Content-Type: text/plain; charset=ISO-8859-10
|
||||||
|
Content-Transfer-Encoding: QUOTED-PRINTABLE
|
||||||
|
|
||||||
|
{text}
|
||||||
|
--this_is_a_boundary
|
||||||
|
Content-Type: text/html; charset=ISO-8859-8
|
||||||
|
Content-Transfer-Encoding: QUOTED-PRINTABLE
|
||||||
|
|
||||||
|
{html}
|
||||||
|
--this_is_a_boundary--
|
||||||
|
""").format(
|
||||||
|
text=quopri.encodestring(text_8859_10).decode("ASCII"),
|
||||||
|
html=quopri.encodestring(html_8859_8).decode("ASCII"),
|
||||||
|
)
|
||||||
|
|
||||||
|
msg = AnymailInboundMessage.parse_raw_mime(raw)
|
||||||
|
self.assertEqual(msg.defects, [])
|
||||||
|
self.assertEqual(msg.text, "Detta är det vanliga innehållet")
|
||||||
|
self.assertEqual(msg.html, "<p>HTML זהו תוכן</p>")
|
||||||
|
|
||||||
|
self.assertEqual(msg.get_payload(0).get_content_bytes(), text_8859_10)
|
||||||
|
self.assertEqual(msg.get_payload(0).get_content_text(), "Detta är det vanliga innehållet")
|
||||||
|
self.assertEqual(msg.get_payload(1).get_content_bytes(), html_8859_8)
|
||||||
|
self.assertEqual(msg.get_payload(1).get_content_text(), "<p>HTML זהו תוכן</p>")
|
||||||
|
|
||||||
|
def test_missing_or_invalid_charsets(self):
|
||||||
|
"""get_content_text has options for handling missing/invalid charset declarations"""
|
||||||
|
raw = dedent("""\
|
||||||
|
Subject: Oops, missing charset declaration
|
||||||
|
Content-Type: text/plain
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
Algunos programas de correo electr=f3nico est=e1n rotos
|
||||||
|
""")
|
||||||
|
msg = AnymailInboundMessage.parse_raw_mime(raw)
|
||||||
|
self.assertEqual(msg.defects, [])
|
||||||
|
|
||||||
|
# default is charset from Content-Type (or 'US-ASCII' if missing), errors='replace'; .text uses defaults
|
||||||
|
self.assertEqual(msg.get_content_text(),
|
||||||
|
"Algunos programas de correo electr<74>nico est<73>n rotos\n")
|
||||||
|
self.assertEqual(msg.text, "Algunos programas de correo electr<74>nico est<73>n rotos\n")
|
||||||
|
|
||||||
|
# can give specific charset if you know headers are wrong/missing
|
||||||
|
self.assertEqual(msg.get_content_text(charset='ISO-8859-1'),
|
||||||
|
"Algunos programas de correo electrónico están rotos\n")
|
||||||
|
|
||||||
|
# can change error handling
|
||||||
|
with self.assertRaises(UnicodeDecodeError):
|
||||||
|
msg.get_content_text(errors='strict')
|
||||||
|
self.assertEqual(msg.get_content_text(errors='ignore'),
|
||||||
|
"Algunos programas de correo electrnico estn rotos\n")
|
||||||
|
|
||||||
def test_date_props(self):
|
def test_date_props(self):
|
||||||
msg = AnymailInboundMessage.construct(headers={
|
msg = AnymailInboundMessage.construct(headers={
|
||||||
'Date': "Mon, 23 Oct 2017 17:50:55 -0700"
|
'Date': "Mon, 23 Oct 2017 17:50:55 -0700"
|
||||||
|
|||||||
Reference in New Issue
Block a user