SendGrid: fix inbound webhook Unicode error when not utf-8

Fix a crash or text-mangling issue when an inbound message uses a charset other than utf-8 for its text or html body, and SendGrid's "post raw" inbound parse option is *not* enabled. Update docs to recommend "post raw" option. Fixes #187
2026-02-05 12:05:21 -05:00 · 2020-07-24 17:32:45 -07:00
parent c4ed6660b3
commit 743d3ce21f
5 changed files with 143 additions and 11 deletions
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -42,6 +42,11 @@ Fixes
 * **Mailjet:** Fix TypeError when sending to or from addresses with display names
  containing commas (introduced in Django 2.2.15, 3.0.9, and 3.1).
 * **SendGrid:** Fix UnicodeError in inbound webhook when receiving a message that
  uses a charset other than utf-8 and SendGrid's "post raw" inbound parse option
  is *not* enabled. Also update docs to recommend "post raw" with SendGrid inbound.
  (Thanks to `@tcourtqtm`_ for reporting the issue.)
 Features
 ~~~~~~~~
@@ -1104,6 +1109,7 @@ Features
 .. _@sebbacon: https://github.com/sebbacon
 .. _@swrobel: https://github.com/swrobel
 .. _@Thorbenl: https://github.com/Thorbenl
 .. _@tcourtqtm: https://github.com/tcourtqtm
 .. _@varche1: https://github.com/varche1
 .. _@vgrebenschikov: https://github.com/vgrebenschikov
 .. _@yourcelf: https://github.com/yourcelf
--- a/anymail/webhooks/sendgrid.py
+++ b/anymail/webhooks/sendgrid.py
@@ -4,6 +4,7 @@ from datetime import datetime
 from django.utils.timezone import utc
 from .base import AnymailBaseWebhookView
 from .._email_compat import EmailBytesParser
 from ..inbound import AnymailInboundMessage
 from ..signals import inbound, tracking, AnymailInboundEvent, AnymailTrackingEvent, EventType, RejectReason
@@ -131,6 +132,9 @@ class SendGridInboundWebhookView(AnymailBaseWebhookView):
        # Inbound uses the entire Django request as esp_event, because we need POST and FILES.
        # Note that request.POST is case-sensitive (unlike email.message.Message headers).
        esp_event = request
        # Must access body before any POST fields, or it won't be available if we need
        # it later (see text_charset and html_charset handling below).
        _ensure_body_is_available_later = request.body  # noqa: F841
        if 'headers' in request.POST:
            # Default (not "Send Raw") inbound fields
            message = self.message_from_sendgrid_parsed(esp_event)
@@ -183,11 +187,33 @@ class SendGridInboundWebhookView(AnymailBaseWebhookView):
                for att_id in sorted(attachment_info.keys())
            ]
        default_charset = request.POST.encoding.lower()  # (probably utf-8)
        text = request.POST.get('text')
        text_charset = charsets.get('text', default_charset).lower()
        html = request.POST.get('html')
        html_charset = charsets.get('html', default_charset).lower()
        if (text and text_charset != default_charset) or (html and html_charset != default_charset):
            # Django has parsed text and/or html fields using the wrong charset.
            # We need to re-parse the raw form data and decode each field separately,
            # using the indicated charsets. The email package parses multipart/form-data
            # retaining bytes content. (In theory, we could instead just change
            # request.encoding and access the POST fields again, per Django docs,
            # but that seems to have bugs around the cached request._files.)
            raw_data = b"".join([
                b"Content-Type: ", request.META['CONTENT_TYPE'].encode('ascii'),
                b"\r\n\r\n",
                request.body
            ])
            parsed_parts = EmailBytesParser().parsebytes(raw_data).get_payload()
            for part in parsed_parts:
                name = part.get_param('name', header='content-disposition')
                if name == 'text':
                    text = part.get_payload(decode=True).decode(text_charset)
                elif name == 'html':
                    html = part.get_payload(decode=True).decode(html_charset)
                # (subject, from, to, etc. are parsed from raw headers field,
                # so no need to worry about their separate POST field charsets)
        return AnymailInboundMessage.construct(
            raw_headers=request.POST.get('headers', ""),  # includes From, To, Cc, Subject, etc.
-            text=request.POST.get('text', None),
+            text=text, html=html, attachments=attachments)
            text_charset=charsets.get('text', 'utf-8'),
            html=request.POST.get('html', None),
            html_charset=charsets.get('html', 'utf-8'),
            attachments=attachments,
        )
--- a/docs/esps/sendgrid.rst
+++ b/docs/esps/sendgrid.rst
@@ -426,10 +426,10 @@ If you want to use Anymail's normalized :attr:`~anymail.inbound.AnymailInboundMe
 :attr:`~anymail.inbound.AnymailInboundMessage.spam_score` attributes, be sure to enable the "Check
 incoming emails for spam" checkbox.
-You have a choice for SendGrid's "POST the raw, full MIME message" checkbox. Anymail will handle
+In most cases, you should enable SendGrid's "POST the raw, full MIME message" checkbox.
-either option (and you can change it at any time). Enabling raw MIME will give the most accurate
+Anymail should work either way (and you can change the option at any time), but enabling
-representation of *any* received email (including complex forms like multi-message mailing list
+raw MIME will give the most accurate representation of *any* received email (including
-digests). But disabling it *may* use less memory while processing messages with many large attachments.
+complex forms like multi-message mailing list digests).
 .. _Inbound Parse Webhook:
   https://sendgrid.com/docs/Classroom/Basics/Inbound_Parse_Webhook/setting_up_the_inbound_parse_webhook.html
--- a/tests/test_sendgrid_inbound.py
+++ b/tests/test_sendgrid_inbound.py
@@ -1,3 +1,5 @@
 # -*- coding: utf-8 -*-
 import json
 from textwrap import dedent
@@ -9,7 +11,7 @@ from anymail.inbound import AnymailInboundMessage
 from anymail.signals import AnymailInboundEvent
 from anymail.webhooks.sendgrid import SendGridInboundWebhookView
-from .utils import sample_image_content, sample_email_content
+from .utils import dedent_bytes, sample_image_content, sample_email_content
 from .webhook_cases import WebhookTestCase
@@ -183,3 +185,59 @@ class SendgridInboundTestCase(WebhookTestCase):
        self.assertEqual(message.subject, 'Raw MIME test')
        self.assertEqual(message.text, u"It's a body\N{HORIZONTAL ELLIPSIS}\n")
        self.assertEqual(message.html, u"""<div dir="ltr">It's a body\N{HORIZONTAL ELLIPSIS}</div>\n""")
    def test_inbound_charsets(self):
        # Verifies inbound handling of a parsed (form-field) SendGrid payload whose
        # fields use several different charsets, as declared in its "charsets" field.
        #
        # Captured (sanitized) from actual SendGrid inbound webhook payload 7/2020,
        # using a test message constructed with a variety of charsets:
        # (The payload is a bytes literal so the non-utf-8 bytes survive intact;
        # dedent_bytes strips the source indentation, and the trailing replace
        # converts the literal's newlines to CRLF.)
        raw_post = dedent_bytes(b"""\
            --xYzZY
            Content-Disposition: form-data; name="headers"
            Date: Fri, 24 Jul 2020 16:43:46 UTC
            To: =?utf-8?q?R=C3=A9cipiendaire_pr=C3=A9cieux?= <inbound@sg.example.com>
            From: =?utf-8?q?Op=C3=A9rateur?= de test <sender@example.com>
            Subject: =?cp850?q?Como_usted_pidi=A2?=
            --xYzZY
            Content-Disposition: form-data; name="subject"
            Como usted pidi\xa2
            --xYzZY
            Content-Disposition: form-data; name="to"
            R\xc3\xa9cipiendaire pr\xc3\xa9cieux <inbound@sg.example.com>
            --xYzZY
            Content-Disposition: form-data; name="html"
            <p>\xbfEsto se ve como esperabas?</p>
            --xYzZY
            Content-Disposition: form-data; name="from"
            Op\xc3\xa9rateur de test <sender@example.com>
            --xYzZY
            Content-Disposition: form-data; name="text"
            Test the ESP\x92s inbound charset handling\x85
            --xYzZY
            Content-Disposition: form-data; name="charsets"
            {"to":"UTF-8","cc":"UTF-8","html":"iso-8859-1","subject":"cp850","from":"UTF-8","text":"windows-1252"}
            --xYzZY--
            """).replace(b"\n", b"\r\n")
        # Post the pre-built body as-is, supplying the multipart boundary ourselves.
        response = self.client.post('/anymail/sendgrid/inbound/', data=raw_post,
                                    content_type="multipart/form-data; boundary=xYzZY")
        self.assertEqual(response.status_code, 200)
        kwargs = self.assert_handler_called_once_with(self.inbound_handler, sender=SendGridInboundWebhookView,
                                                      event=ANY, esp_name='SendGrid')
        event = kwargs['event']
        message = event.message
        # From/To display names are utf-8 in the payload.
        self.assertEqual(message.from_email.display_name, u"Opérateur de test")
        self.assertEqual(message.from_email.addr_spec, "sender@example.com")
        self.assertEqual(len(message.to), 1)
        self.assertEqual(message.to[0].display_name, u"Récipiendaire précieux")
        self.assertEqual(message.to[0].addr_spec, "inbound@sg.example.com")
        # Subject is declared cp850 (byte A2 is o-acute); text is declared
        # windows-1252 (bytes 92 and 85 are the right single quote and ellipsis);
        # html is declared iso-8859-1 (byte BF is the inverted question mark).
        self.assertEqual(message.subject, u"Como usted pidió")
        self.assertEqual(message.text, u"Test the ESP’s inbound charset handling…")
        self.assertEqual(message.html, u"<p>¿Esto se ve como esperabas?</p>")
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -324,3 +324,45 @@ class ClientWithCsrfChecks(Client):
    def __init__(self, **defaults):
        """Initialize the parent client with CSRF checks always enforced.

        All keyword arguments in ``defaults`` are forwarded unchanged to the
        parent constructor, alongside ``enforce_csrf_checks=True``.
        """
        parent_init = super(ClientWithCsrfChecks, self).__init__
        parent_init(enforce_csrf_checks=True, **defaults)
 # Bytes counterpart of textwrap.dedent
 # https://stackoverflow.com/a/39841195/647002
 _whitespace_only_re = re.compile(b'^[ \t]+$', re.MULTILINE)
 _leading_whitespace_re = re.compile(b'(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
 def dedent_bytes(text):
    """Remove the common leading whitespace from every line of a bytestring.

    Behaves like textwrap.dedent, but for bytes: lines made up solely of
    spaces and tabs are blanked first and do not influence the margin, and
    the margin is the longest run of spaces/tabs shared by all other lines.
    """
    # Blank out whitespace-only lines so they cannot shrink the margin.
    text = _whitespace_only_re.sub(b'', text)

    common = None
    for lead in _leading_whitespace_re.findall(text):
        if common is None:
            # First non-blank line: its indentation is the initial candidate.
            common = lead
            continue
        # Trim the candidate to the longest prefix it shares with this line.
        shared = 0
        for ours, theirs in zip(common, lead):
            if ours != theirs:
                break
            shared += 1
        common = common[:shared]

    if not common:
        # No non-blank lines, or no shared indentation: nothing to strip.
        return text
    return re.sub(b'(?m)^' + common, b'', text)