diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cca8edd..87ea53b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -42,6 +42,11 @@ Fixes * **Mailjet:** Fix TypeError when sending to or from addresses with display names containing commas (introduced in Django 2.2.15, 3.0.9, and 3.1). +* **SendGrid:** Fix UnicodeError in inbound webhook, when receiving messages using + charsets other than utf-8, and *not* using SendGrid's "post raw" inbound parse + option. Also update docs to recommend "post raw" with SendGrid inbound. (Thanks to + `@tcourtqtm`_ for reporting the issue.) + Features ~~~~~~~~ @@ -1104,6 +1109,7 @@ Features .. _@sebbacon: https://github.com/sebbacon .. _@swrobel: https://github.com/swrobel .. _@Thorbenl: https://github.com/Thorbenl +.. _@tcourtqtm: https://github.com/tcourtqtm .. _@varche1: https://github.com/varche1 .. _@vgrebenschikov: https://github.com/vgrebenschikov .. _@yourcelf: https://github.com/yourcelf diff --git a/anymail/webhooks/sendgrid.py b/anymail/webhooks/sendgrid.py index b8d94fe..c3d7322 100644 --- a/anymail/webhooks/sendgrid.py +++ b/anymail/webhooks/sendgrid.py @@ -4,6 +4,7 @@ from datetime import datetime from django.utils.timezone import utc from .base import AnymailBaseWebhookView +from .._email_compat import EmailBytesParser from ..inbound import AnymailInboundMessage from ..signals import inbound, tracking, AnymailInboundEvent, AnymailTrackingEvent, EventType, RejectReason @@ -131,6 +132,9 @@ class SendGridInboundWebhookView(AnymailBaseWebhookView): # Inbound uses the entire Django request as esp_event, because we need POST and FILES. # Note that request.POST is case-sensitive (unlike email.message.Message headers). esp_event = request + # Must access body before any POST fields, or it won't be available if we need + # it later (see text_charset and html_charset handling below). 
+ _ensure_body_is_available_later = request.body # noqa: F841 if 'headers' in request.POST: # Default (not "Send Raw") inbound fields message = self.message_from_sendgrid_parsed(esp_event) @@ -183,11 +187,33 @@ class SendGridInboundWebhookView(AnymailBaseWebhookView): for att_id in sorted(attachment_info.keys()) ] + default_charset = request.POST.encoding.lower() # (probably utf-8) + text = request.POST.get('text') + text_charset = charsets.get('text', default_charset).lower() + html = request.POST.get('html') + html_charset = charsets.get('html', default_charset).lower() + if (text and text_charset != default_charset) or (html and html_charset != default_charset): + # Django has parsed text and/or html fields using the wrong charset. + # We need to re-parse the raw form data and decode each field separately, + # using the indicated charsets. The email package parses multipart/form-data + # retaining bytes content. (In theory, we could instead just change + # request.encoding and access the POST fields again, per Django docs, + # but that seems to have bugs around the cached request._files.) + raw_data = b"".join([ + b"Content-Type: ", request.META['CONTENT_TYPE'].encode('ascii'), + b"\r\n\r\n", + request.body + ]) + parsed_parts = EmailBytesParser().parsebytes(raw_data).get_payload() + for part in parsed_parts: + name = part.get_param('name', header='content-disposition') + if name == 'text': + text = part.get_payload(decode=True).decode(text_charset) + elif name == 'html': + html = part.get_payload(decode=True).decode(html_charset) + # (subject, from, to, etc. are parsed from raw headers field, + # so no need to worry about their separate POST field charsets) + return AnymailInboundMessage.construct( raw_headers=request.POST.get('headers', ""), # includes From, To, Cc, Subject, etc. 
- text=request.POST.get('text', None), - text_charset=charsets.get('text', 'utf-8'), - html=request.POST.get('html', None), - html_charset=charsets.get('html', 'utf-8'), - attachments=attachments, - ) + text=text, html=html, attachments=attachments) diff --git a/docs/esps/sendgrid.rst b/docs/esps/sendgrid.rst index 5469fa5..bc10def 100644 --- a/docs/esps/sendgrid.rst +++ b/docs/esps/sendgrid.rst @@ -426,10 +426,10 @@ If you want to use Anymail's normalized :attr:`~anymail.inbound.AnymailInboundMe :attr:`~anymail.inbound.AnymailInboundMessage.spam_score` attributes, be sure to enable the "Check incoming emails for spam" checkbox. -You have a choice for SendGrid's "POST the raw, full MIME message" checkbox. Anymail will handle -either option (and you can change it at any time). Enabling raw MIME will give the most accurate -representation of *any* received email (including complex forms like multi-message mailing list -digests). But disabling it *may* use less memory while processing messages with many large attachments. +In most cases, you should enable SendGrid's "POST the raw, full MIME message" checkbox. +Anymail should work either way (and you can change the option at any time), but enabling +raw MIME will give the most accurate representation of *any* received email (including +complex forms like multi-message mailing list digests). .. 
_Inbound Parse Webhook: https://sendgrid.com/docs/Classroom/Basics/Inbound_Parse_Webhook/setting_up_the_inbound_parse_webhook.html diff --git a/tests/test_sendgrid_inbound.py b/tests/test_sendgrid_inbound.py index ae41060..f91d1d8 100644 --- a/tests/test_sendgrid_inbound.py +++ b/tests/test_sendgrid_inbound.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + import json from textwrap import dedent @@ -9,7 +11,7 @@ from anymail.inbound import AnymailInboundMessage from anymail.signals import AnymailInboundEvent from anymail.webhooks.sendgrid import SendGridInboundWebhookView -from .utils import sample_image_content, sample_email_content +from .utils import dedent_bytes, sample_image_content, sample_email_content from .webhook_cases import WebhookTestCase @@ -183,3 +185,59 @@ class SendgridInboundTestCase(WebhookTestCase): self.assertEqual(message.subject, 'Raw MIME test') self.assertEqual(message.text, u"It's a body\N{HORIZONTAL ELLIPSIS}\n") self.assertEqual(message.html, u"""
It's a body\N{HORIZONTAL ELLIPSIS}
\n""") + + def test_inbound_charsets(self): + # Captured (sanitized) from actual SendGrid inbound webhook payload 7/2020, + # using a test message constructed with a variety of charsets: + raw_post = dedent_bytes(b"""\ + --xYzZY + Content-Disposition: form-data; name="headers" + + Date: Fri, 24 Jul 2020 16:43:46 UTC + To: =?utf-8?q?R=C3=A9cipiendaire_pr=C3=A9cieux?= + From: =?utf-8?q?Op=C3=A9rateur?= de test + Subject: =?cp850?q?Como_usted_pidi=A2?= + + --xYzZY + Content-Disposition: form-data; name="subject" + + Como usted pidi\xa2 + --xYzZY + Content-Disposition: form-data; name="to" + + R\xc3\xa9cipiendaire pr\xc3\xa9cieux + --xYzZY + Content-Disposition: form-data; name="html" + +

\xbfEsto se ve como esperabas?

+ --xYzZY + Content-Disposition: form-data; name="from" + + Op\xc3\xa9rateur de test + --xYzZY + Content-Disposition: form-data; name="text" + + Test the ESP\x92s inbound charset handling\x85 + --xYzZY + Content-Disposition: form-data; name="charsets" + + {"to":"UTF-8","cc":"UTF-8","html":"iso-8859-1","subject":"cp850","from":"UTF-8","text":"windows-1252"} + --xYzZY-- + """).replace(b"\n", b"\r\n") + + response = self.client.post('/anymail/sendgrid/inbound/', data=raw_post, + content_type="multipart/form-data; boundary=xYzZY") + self.assertEqual(response.status_code, 200) + kwargs = self.assert_handler_called_once_with(self.inbound_handler, sender=SendGridInboundWebhookView, + event=ANY, esp_name='SendGrid') + event = kwargs['event'] + message = event.message + + self.assertEqual(message.from_email.display_name, u"Opérateur de test") + self.assertEqual(message.from_email.addr_spec, "sender@example.com") + self.assertEqual(len(message.to), 1) + self.assertEqual(message.to[0].display_name, u"Récipiendaire précieux") + self.assertEqual(message.to[0].addr_spec, "inbound@sg.example.com") + self.assertEqual(message.subject, u"Como usted pidió") + self.assertEqual(message.text, u"Test the ESP’s inbound charset handling…") + self.assertEqual(message.html, u"

¿Esto se ve como esperabas?

") diff --git a/tests/utils.py b/tests/utils.py index 5aa898e..f0ac31c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -324,3 +324,45 @@ class ClientWithCsrfChecks(Client): def __init__(self, **defaults): super(ClientWithCsrfChecks, self).__init__( enforce_csrf_checks=True, **defaults) + + +# dedent for bytestrs +# https://stackoverflow.com/a/39841195/647002 +_whitespace_only_re = re.compile(b'^[ \t]+$', re.MULTILINE) +_leading_whitespace_re = re.compile(b'(^[ \t]*)(?:[^ \t\n])', re.MULTILINE) + + +def dedent_bytes(text): + """textwrap.dedent, but for bytes""" + # Look for the longest leading string of spaces and tabs common to + # all lines. + margin = None + text = _whitespace_only_re.sub(b'', text) + indents = _leading_whitespace_re.findall(text) + for indent in indents: + if margin is None: + margin = indent + + # Current line more deeply indented than previous winner: + # no change (previous winner is still on top). + elif indent.startswith(margin): + pass + + # Current line consistent with and no deeper than previous winner: + # it's the new winner. + elif margin.startswith(indent): + margin = indent + + # Find the largest common whitespace between current line + # and previous winner. + else: + for i, (x, y) in enumerate(zip(margin, indent)): + if x != y: + margin = margin[:i] + break + else: + margin = margin[:len(indent)] + + if margin: + text = re.sub(b'(?m)^' + margin, b'', text) + return text