SendGrid: fix inbound webhook Unicode error when not utf-8

Fix a crash or text-mangling issue when an inbound message
uses a charset other than utf-8 for its text or html body,
and SendGrid's "post raw" inbound parse option is *not*
enabled.

Update docs to recommend "post raw" option.

Fixes #187
This commit is contained in:
Mike Edmunds
2020-07-24 17:32:45 -07:00
committed by GitHub
parent c4ed6660b3
commit 743d3ce21f
5 changed files with 143 additions and 11 deletions

View File

@@ -42,6 +42,11 @@ Fixes
* **Mailjet:** Fix TypeError when sending to or from addresses with display names * **Mailjet:** Fix TypeError when sending to or from addresses with display names
containing commas (introduced in Django 2.2.15, 3.0.9, and 3.1). containing commas (introduced in Django 2.2.15, 3.0.9, and 3.1).
* **SendGrid:** Fix UnicodeError in inbound webhook when receiving a message that uses
charsets other than utf-8 and SendGrid's "post raw" inbound parse option is *not*
enabled. Also update docs to recommend "post raw" with SendGrid inbound. (Thanks to
`@tcourtqtm`_ for reporting the issue.)
Features Features
~~~~~~~~ ~~~~~~~~
@@ -1104,6 +1109,7 @@ Features
.. _@sebbacon: https://github.com/sebbacon .. _@sebbacon: https://github.com/sebbacon
.. _@swrobel: https://github.com/swrobel .. _@swrobel: https://github.com/swrobel
.. _@Thorbenl: https://github.com/Thorbenl .. _@Thorbenl: https://github.com/Thorbenl
.. _@tcourtqtm: https://github.com/tcourtqtm
.. _@varche1: https://github.com/varche1 .. _@varche1: https://github.com/varche1
.. _@vgrebenschikov: https://github.com/vgrebenschikov .. _@vgrebenschikov: https://github.com/vgrebenschikov
.. _@yourcelf: https://github.com/yourcelf .. _@yourcelf: https://github.com/yourcelf

View File

@@ -4,6 +4,7 @@ from datetime import datetime
from django.utils.timezone import utc from django.utils.timezone import utc
from .base import AnymailBaseWebhookView from .base import AnymailBaseWebhookView
from .._email_compat import EmailBytesParser
from ..inbound import AnymailInboundMessage from ..inbound import AnymailInboundMessage
from ..signals import inbound, tracking, AnymailInboundEvent, AnymailTrackingEvent, EventType, RejectReason from ..signals import inbound, tracking, AnymailInboundEvent, AnymailTrackingEvent, EventType, RejectReason
@@ -131,6 +132,9 @@ class SendGridInboundWebhookView(AnymailBaseWebhookView):
# Inbound uses the entire Django request as esp_event, because we need POST and FILES. # Inbound uses the entire Django request as esp_event, because we need POST and FILES.
# Note that request.POST is case-sensitive (unlike email.message.Message headers). # Note that request.POST is case-sensitive (unlike email.message.Message headers).
esp_event = request esp_event = request
# Must access body before any POST fields, or it won't be available if we need
# it later (see text_charset and html_charset handling below).
_ensure_body_is_available_later = request.body # noqa: F841
if 'headers' in request.POST: if 'headers' in request.POST:
# Default (not "Send Raw") inbound fields # Default (not "Send Raw") inbound fields
message = self.message_from_sendgrid_parsed(esp_event) message = self.message_from_sendgrid_parsed(esp_event)
@@ -183,11 +187,33 @@ class SendGridInboundWebhookView(AnymailBaseWebhookView):
for att_id in sorted(attachment_info.keys()) for att_id in sorted(attachment_info.keys())
] ]
default_charset = request.POST.encoding.lower() # (probably utf-8)
text = request.POST.get('text')
text_charset = charsets.get('text', default_charset).lower()
html = request.POST.get('html')
html_charset = charsets.get('html', default_charset).lower()
if (text and text_charset != default_charset) or (html and html_charset != default_charset):
# Django has parsed text and/or html fields using the wrong charset.
# We need to re-parse the raw form data and decode each field separately,
# using the indicated charsets. The email package parses multipart/form-data
# retaining bytes content. (In theory, we could instead just change
# request.encoding and access the POST fields again, per Django docs,
# but that seems to have bugs around the cached request._files.)
raw_data = b"".join([
b"Content-Type: ", request.META['CONTENT_TYPE'].encode('ascii'),
b"\r\n\r\n",
request.body
])
parsed_parts = EmailBytesParser().parsebytes(raw_data).get_payload()
for part in parsed_parts:
name = part.get_param('name', header='content-disposition')
if name == 'text':
text = part.get_payload(decode=True).decode(text_charset)
elif name == 'html':
html = part.get_payload(decode=True).decode(html_charset)
# (subject, from, to, etc. are parsed from raw headers field,
# so no need to worry about their separate POST field charsets)
return AnymailInboundMessage.construct( return AnymailInboundMessage.construct(
raw_headers=request.POST.get('headers', ""), # includes From, To, Cc, Subject, etc. raw_headers=request.POST.get('headers', ""), # includes From, To, Cc, Subject, etc.
text=request.POST.get('text', None), text=text, html=html, attachments=attachments)
text_charset=charsets.get('text', 'utf-8'),
html=request.POST.get('html', None),
html_charset=charsets.get('html', 'utf-8'),
attachments=attachments,
)

View File

@@ -426,10 +426,10 @@ If you want to use Anymail's normalized :attr:`~anymail.inbound.AnymailInboundMe
:attr:`~anymail.inbound.AnymailInboundMessage.spam_score` attributes, be sure to enable the "Check :attr:`~anymail.inbound.AnymailInboundMessage.spam_score` attributes, be sure to enable the "Check
incoming emails for spam" checkbox. incoming emails for spam" checkbox.
You have a choice for SendGrid's "POST the raw, full MIME message" checkbox. Anymail will handle In most cases, you should enable SendGrid's "POST the raw, full MIME message" checkbox.
either option (and you can change it at any time). Enabling raw MIME will give the most accurate Anymail should work either way (and you can change the option at any time), but enabling
representation of *any* received email (including complex forms like multi-message mailing list raw MIME will give the most accurate representation of *any* received email (including
digests). But disabling it *may* use less memory while processing messages with many large attachments. complex forms like multi-message mailing list digests).
.. _Inbound Parse Webhook: .. _Inbound Parse Webhook:
https://sendgrid.com/docs/Classroom/Basics/Inbound_Parse_Webhook/setting_up_the_inbound_parse_webhook.html https://sendgrid.com/docs/Classroom/Basics/Inbound_Parse_Webhook/setting_up_the_inbound_parse_webhook.html

View File

@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import json import json
from textwrap import dedent from textwrap import dedent
@@ -9,7 +11,7 @@ from anymail.inbound import AnymailInboundMessage
from anymail.signals import AnymailInboundEvent from anymail.signals import AnymailInboundEvent
from anymail.webhooks.sendgrid import SendGridInboundWebhookView from anymail.webhooks.sendgrid import SendGridInboundWebhookView
from .utils import sample_image_content, sample_email_content from .utils import dedent_bytes, sample_image_content, sample_email_content
from .webhook_cases import WebhookTestCase from .webhook_cases import WebhookTestCase
@@ -183,3 +185,59 @@ class SendgridInboundTestCase(WebhookTestCase):
self.assertEqual(message.subject, 'Raw MIME test') self.assertEqual(message.subject, 'Raw MIME test')
self.assertEqual(message.text, u"It's a body\N{HORIZONTAL ELLIPSIS}\n") self.assertEqual(message.text, u"It's a body\N{HORIZONTAL ELLIPSIS}\n")
self.assertEqual(message.html, u"""<div dir="ltr">It's a body\N{HORIZONTAL ELLIPSIS}</div>\n""") self.assertEqual(message.html, u"""<div dir="ltr">It's a body\N{HORIZONTAL ELLIPSIS}</div>\n""")
def test_inbound_charsets(self):
    # Verifies that parsed (not "post raw") inbound fields come through correctly
    # when the payload's "charsets" field declares something other than utf-8
    # for the text, html and subject fields.
    #
    # Captured (sanitized) from actual SendGrid inbound webhook payload 7/2020,
    # using a test message constructed with a variety of charsets.
    # Each form-data part is "header line, blank line, value"; the blank line
    # is what separates a part's headers from its content.
    raw_post = dedent_bytes(b"""\
        --xYzZY
        Content-Disposition: form-data; name="headers"

        Date: Fri, 24 Jul 2020 16:43:46 UTC
        To: =?utf-8?q?R=C3=A9cipiendaire_pr=C3=A9cieux?= <inbound@sg.example.com>
        From: =?utf-8?q?Op=C3=A9rateur?= de test <sender@example.com>
        Subject: =?cp850?q?Como_usted_pidi=A2?=
        --xYzZY
        Content-Disposition: form-data; name="subject"

        Como usted pidi\xa2
        --xYzZY
        Content-Disposition: form-data; name="to"

        R\xc3\xa9cipiendaire pr\xc3\xa9cieux <inbound@sg.example.com>
        --xYzZY
        Content-Disposition: form-data; name="html"

        <p>\xbfEsto se ve como esperabas?</p>
        --xYzZY
        Content-Disposition: form-data; name="from"

        Op\xc3\xa9rateur de test <sender@example.com>
        --xYzZY
        Content-Disposition: form-data; name="text"

        Test the ESP\x92s inbound charset handling\x85
        --xYzZY
        Content-Disposition: form-data; name="charsets"

        {"to":"UTF-8","cc":"UTF-8","html":"iso-8859-1","subject":"cp850","from":"UTF-8","text":"windows-1252"}
        --xYzZY--
        """).replace(b"\n", b"\r\n")  # multipart/form-data uses CRLF line endings

    response = self.client.post('/anymail/sendgrid/inbound/', data=raw_post,
                                content_type="multipart/form-data; boundary=xYzZY")
    self.assertEqual(response.status_code, 200)
    kwargs = self.assert_handler_called_once_with(self.inbound_handler, sender=SendGridInboundWebhookView,
                                                  event=ANY, esp_name='SendGrid')
    event = kwargs['event']
    message = event.message

    # Address and subject values are RFC 2047 encoded-words in the "headers" field
    self.assertEqual(message.from_email.display_name, u"Opérateur de test")
    self.assertEqual(message.from_email.addr_spec, "sender@example.com")
    self.assertEqual(len(message.to), 1)
    self.assertEqual(message.to[0].display_name, u"Récipiendaire précieux")
    self.assertEqual(message.to[0].addr_spec, "inbound@sg.example.com")
    self.assertEqual(message.subject, u"Como usted pidió")

    # windows-1252: \x92 is RIGHT SINGLE QUOTATION MARK, \x85 is HORIZONTAL ELLIPSIS
    self.assertEqual(
        message.text,
        u"Test the ESP\N{RIGHT SINGLE QUOTATION MARK}s inbound charset handling\N{HORIZONTAL ELLIPSIS}")
    # iso-8859-1: \xbf is INVERTED QUESTION MARK
    self.assertEqual(message.html, u"<p>¿Esto se ve como esperabas?</p>")

View File

@@ -324,3 +324,45 @@ class ClientWithCsrfChecks(Client):
def __init__(self, **defaults): def __init__(self, **defaults):
super(ClientWithCsrfChecks, self).__init__( super(ClientWithCsrfChecks, self).__init__(
enforce_csrf_checks=True, **defaults) enforce_csrf_checks=True, **defaults)
# dedent for bytestrs
# https://stackoverflow.com/a/39841195/647002
_whitespace_only_re = re.compile(b'^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile(b'(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent_bytes(text):
    """textwrap.dedent, but for bytes

    Removes the longest run of leading spaces/tabs that is common to every
    non-blank line of `text`. Lines consisting solely of spaces and tabs are
    first normalized to empty lines and do not influence the margin.

    :param text: bytes to dedent
    :return: bytes with the common margin stripped from the start of each
        line; the text is returned with only the blank-line normalization
        applied when the lines share no margin
    """
    margin = None
    text = _whitespace_only_re.sub(b'', text)

    # Narrow `margin` down to the common prefix of every line's indentation.
    for indent in _leading_whitespace_re.findall(text):
        if margin is None:
            margin = indent
        elif indent.startswith(margin):
            # Current line more deeply indented than previous winner:
            # no change (previous winner is still on top).
            pass
        elif margin.startswith(indent):
            # Current line consistent with and no deeper than previous winner:
            # it's the new winner.
            margin = indent
        else:
            # Neither is a prefix of the other, so they must differ at some
            # position within the shorter one: cut the margin there.
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break

    if margin:
        # margin holds only spaces and tabs, so it is safe as a literal pattern
        text = re.sub(b'(?m)^' + margin, b'', text)
    return text