Inbound: fix charset handling in .text, .html, .get_content_text()

Make `AnymailInboundMessage.text`, `.html` and `.get_content_text()`
usually do the right thing for non-UTF-8 messages/attachments. Fixes
an incorrect UnicodeDecodeError when receiving a message encoded in
a charset other than UTF-8 (e.g., ISO-8859-1), and improves handling
for inbound messages that were not properly encoded by the sender.

* Decode using the message's (or attachment's) declared charset
  by default (rather than always defaulting to 'utf-8'); you can
  still override with `get_content_text(charset=...)`.
* Add `errors` param to `get_content_text()`, defaulting to 'replace'.
  Mis-encoded messages will now use the Unicode replacement character
  rather than raising errors. (Use `get_content_text(errors='strict')`
  for the previous behavior.)
This commit is contained in:
medmunds
2018-04-01 14:18:35 -07:00
parent 97fc869992
commit 3928f6ea5e
3 changed files with 84 additions and 8 deletions

View File

@@ -199,9 +199,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
# should themselves be AnymailInboundMessage.
for part in self.walk():
if part.get_content_type() == content_type and not part.is_attachment():
payload = part.get_payload(decode=True)
if payload is not None:
return payload.decode('utf-8')
return part.get_content_text()
return None
# Backport from Python 3.5 email.message.Message
@@ -238,7 +236,7 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
"(perhaps you want as_bytes()?)")
return self.get_payload(decode=True)
def get_content_text(self, charset='utf-8'):
def get_content_text(self, charset=None, errors=None):
"""Return the payload decoded to text"""
maintype = self.get_content_maintype()
if maintype == 'message':
@@ -252,7 +250,13 @@ class AnymailInboundMessage(Message, object): # `object` ensures new-style clas
# and it's not clear which one is the "content".
raise ValueError("get_content_text() is not valid on multipart messages "
"(perhaps you want as_string()?)")
return self.get_payload(decode=True).decode(charset)
else:
payload = self.get_payload(decode=True)
if payload is None:
return payload
charset = charset or self.get_content_charset('US-ASCII')
errors = errors or 'replace'
return payload.decode(charset, errors=errors)
def as_uploaded_file(self):
"""Return the attachment converted to a Django UploadedFile"""