Internal: move lengthy compatibility EmailParser into own file

2026-02-05 12:05:21 -05:00 · 2018-04-01 15:24:18 -07:00
parent dbe48d48af
commit 008aef237e
2 changed files with 140 additions and 125 deletions
--- a/anymail/_email_compat.py
+++ b/anymail/_email_compat.py
@@ -0,0 +1,139 @@
 # Work around bugs in older versions of email.parser.Parser
 #
 # This module implements two classes:
 #   EmailParser
 #   EmailBytesParser
 # which can be used like the Python 3.3+ email.parser.Parser
 # and email.parser.BytesParser (with email.policy.default).
 #
 # On Python 2.7, they attempt to work around some bugs/limitations
 # in email.parser.Parser, without trying to back-port the whole
 # Python 3 email package.
 __all__ = ['EmailParser', 'EmailBytesParser']
 from email.parser import Parser
 try:
    # With Python 3.3+ (email6) package, using `policy=email.policy.default`
    # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
    # which *preserves* earlier bugs.)
    from email.policy import default
    from email.parser import BytesParser
    class EmailParser(Parser):
        """Text-message parser whose policy defaults to `email.policy.default`."""

        def __init__(self, _class=None, policy=default):
            # Parser itself defaults to the compat32 policy, which preserves the
            # older parsing bugs; only the default differs here, callers may still
            # pass any policy they like.
            super().__init__(_class, policy=policy)
    class EmailBytesParser(BytesParser):
        """Bytes-message parser whose policy defaults to `email.policy.default`."""

        def __init__(self, _class=None, policy=default):
            # Same reasoning as for the text parser: avoid the compat32 default,
            # which preserves the older parsing bugs.
            super().__init__(_class, policy=policy)
 except ImportError:
    # Pre-Python 3.3 email package: try to work around some bugs
    from email.header import decode_header
    from collections import deque
    class EmailParser(Parser):
        """Parser that patches up header handling of the pre-Python 3.3 email package."""

        def parse(self, fp, headersonly=False):
            """Parse the message read from file-like `fp`, unfolding and decoding its headers."""
            # The older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3),
            # so hand it a stream whose root headers have already been unfolded.
            unfolded_fp = HeaderUnfoldingWrapper(fp)
            message = Parser.parse(self, unfolded_fp, headersonly=headersonly)
            # The older Parser doesn't decode RFC2047 headers either. The message is
            # fully parsed by now, so the headers of every MIME subpart can be fixed up.
            for part in message.walk():
                decoded_headers = []
                for name, value in part._headers:
                    decoded_headers.append((name, _decode_rfc2047(value)))
                # There doesn't seem to be a public API to replace all headers at once.
                part._headers = decoded_headers
            return message
    class EmailBytesParser(EmailParser):
        """Bytes counterpart of EmailParser for the pre-Python 3.3 email package."""

        def parsebytes(self, text, headersonly=False):
            """Parse a message supplied as bytes by delegating to `parsestr`."""
            # In Python 2, bytes is str, and Parser.parsestr reads its input
            # through the bytes-friendly cStringIO.StringIO.
            return self.parsestr(text, headersonly=headersonly)
    class HeaderUnfoldingWrapper(object):
        """
        A wrapper for file-like objects passed to email.parser.Parser.parse which works
        around older Parser bugs with folded email headers by pre-unfolding them.

        This only works for headers at the message root, not ones within a MIME subpart.
        (Accurately recognizing subpart headers would require parsing mixed-content boundaries.)

        Only `readline` and `read` are provided. Once the blank line that ends the
        header section has been returned, data is passed through from `fp` unchanged.
        """

        def __init__(self, fp):
            """Wrap `fp`, a file-like object offering `readline(limit)` and `read(size)`."""
            self.fp = fp
            self._in_headers = True  # cleared once the empty line ending the headers is read
            self._pushback = deque()  # lines already read from fp but not yet returned

        def _readline(self, limit=-1):
            """Return the next physical line, preferring lines that were pushed back."""
            try:
                line = self._pushback.popleft()
            except IndexError:
                line = self.fp.readline(limit)
                # cStringIO.readline doesn't recognize universal newlines; splitlines does
                lines = line.splitlines(True)
                if len(lines) > 1:
                    # fp returned several CR-terminated lines at once:
                    # return the first and queue the rest
                    line = lines[0]
                    self._pushback.extend(lines[1:])
            return line

        def _peekline(self, limit=-1):
            """Return the next physical line without consuming it."""
            try:
                line = self._pushback[0]
            except IndexError:
                line = self._readline(limit)
                self._pushback.appendleft(line)
            return line

        def readline(self, limit=-1):
            """
            Return the next line; within the header section, a line that is
            continued on the following line is returned without its line ending.
            """
            line = self._readline(limit)
            if self._in_headers:
                line_without_end = line.rstrip("\r\n")  # CRLF, CR, or LF -- "universal newlines"
                if not line_without_end:
                    # RFC5322 section 2.1: "The body ... is separated from the header section
                    # by an empty line (i.e., a line with nothing preceding the CRLF)."
                    # (An empty string, i.e. end of input, also ends the header section.)
                    self._in_headers = False
                else:
                    # Is this header line folded? Need to check next line...
                    # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF
                    # that is immediately followed by WSP." (WSP is space or tab)
                    next_line = self._peekline(limit)
                    if next_line.startswith((' ', '\t')):
                        line = line_without_end
            return line

        def read(self, size=-1):
            """
            Return the next chunk of data.

            `size` is only a hint: in the header section a single (unfolded) line is
            returned regardless of it, and any pushed-back lines are returned whole.
            It defaults to -1 so that a bare `read()` works as it does on file objects.
            """
            if self._in_headers:
                # For simplicity, just read a line at a time while in the header section.
                # (This works because we know email.parser.Parser doesn't really care if it reads
                # more or less data than it asked for -- it just pushes it into FeedParser either way.)
                return self.readline(size)
            if self._pushback:
                # Flush lines that were read ahead before the header section ended
                buf = ''.join(self._pushback)
                self._pushback.clear()
                return buf
            return self.fp.read(size)
    def _decode_rfc2047(value):
        result = value
        decoded_segments = decode_header(value)
        if any(charset is not None for raw, charset in decoded_segments):
            # At least one segment is an RFC2047 encoded-word.
            # Reassemble the segments into a single decoded string.
            unicode_segments = []
            prev_charset = None
            for raw, charset in decoded_segments:
                if (charset is None or prev_charset is None) and unicode_segments:
                    # Transitioning to, from, or between *non*-encoded segments:
                    # add back inter-segment whitespace that decode_header consumed
                    unicode_segments.append(u" ")
                decoded = raw.decode(charset, 'replace') if charset is not None else raw
                unicode_segments.append(decoded)
                prev_charset = charset
            result = u"".join(unicode_segments)
        return result
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -1,137 +1,13 @@
 from base64 import b64decode
 from email.message import Message
 from email.parser import Parser
 from email.utils import unquote
 import six
 from django.core.files.uploadedfile import SimpleUploadedFile
 from ._email_compat import EmailParser, EmailBytesParser
 from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date
 # Work around bugs in older versions of email.parser.Parser
 try:
    # With Python 3.3+ (email6) package, using `policy=email.policy.default`
    # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
    # which *preserves* earlier bugs.)
    from email.policy import default
    from email.parser import BytesParser
    class EmailParser(Parser):
        """Text-message parser whose policy defaults to `email.policy.default`."""

        def __init__(self, _class=None, policy=default):
            # Parser itself defaults to the compat32 policy, which preserves the
            # older parsing bugs; only the default differs here, callers may still
            # pass any policy they like.
            super().__init__(_class, policy=policy)
    class EmailBytesParser(BytesParser):
        """Bytes-message parser whose policy defaults to `email.policy.default`."""

        def __init__(self, _class=None, policy=default):
            # Same reasoning as for the text parser: avoid the compat32 default,
            # which preserves the older parsing bugs.
            super().__init__(_class, policy=policy)
 except ImportError:
    # Pre-Python 3.3 email package: try to work around some bugs
    from email.header import decode_header
    from collections import deque
    class EmailParser(Parser):
        """Parser that patches up header handling of the pre-Python 3.3 email package."""

        def parse(self, fp, headersonly=False):
            """Parse the message read from file-like `fp`, unfolding and decoding its headers."""
            # The older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3),
            # so hand it a stream whose root headers have already been unfolded.
            unfolded_fp = HeaderUnfoldingWrapper(fp)
            message = Parser.parse(self, unfolded_fp, headersonly=headersonly)
            # The older Parser doesn't decode RFC2047 headers either. The message is
            # fully parsed by now, so the headers of every MIME subpart can be fixed up.
            for part in message.walk():
                decoded_headers = []
                for name, value in part._headers:
                    decoded_headers.append((name, _decode_rfc2047(value)))
                # There doesn't seem to be a public API to replace all headers at once.
                part._headers = decoded_headers
            return message
    class EmailBytesParser(EmailParser):
        """Bytes counterpart of EmailParser for the pre-Python 3.3 email package."""

        def parsebytes(self, text, headersonly=False):
            """Parse a message supplied as bytes by delegating to `parsestr`."""
            # In Python 2, bytes is str, and Parser.parsestr reads its input
            # through the bytes-friendly cStringIO.StringIO.
            return self.parsestr(text, headersonly=headersonly)
    class HeaderUnfoldingWrapper(object):
        """
        A wrapper for file-like objects passed to email.parser.Parser.parse which works
        around older Parser bugs with folded email headers by pre-unfolding them.

        This only works for headers at the message root, not ones within a MIME subpart.
        (Accurately recognizing subpart headers would require parsing mixed-content boundaries.)

        Only `readline` and `read` are provided. Once the blank line that ends the
        header section has been returned, data is passed through from `fp` unchanged.
        """

        def __init__(self, fp):
            """Wrap `fp`, a file-like object offering `readline(limit)` and `read(size)`."""
            self.fp = fp
            self._in_headers = True  # cleared once the empty line ending the headers is read
            self._pushback = deque()  # lines already read from fp but not yet returned

        def _readline(self, limit=-1):
            """Return the next physical line, preferring lines that were pushed back."""
            try:
                line = self._pushback.popleft()
            except IndexError:
                line = self.fp.readline(limit)
                # cStringIO.readline doesn't recognize universal newlines; splitlines does
                lines = line.splitlines(True)
                if len(lines) > 1:
                    # fp returned several CR-terminated lines at once:
                    # return the first and queue the rest
                    line = lines[0]
                    self._pushback.extend(lines[1:])
            return line

        def _peekline(self, limit=-1):
            """Return the next physical line without consuming it."""
            try:
                line = self._pushback[0]
            except IndexError:
                line = self._readline(limit)
                self._pushback.appendleft(line)
            return line

        def readline(self, limit=-1):
            """
            Return the next line; within the header section, a line that is
            continued on the following line is returned without its line ending.
            """
            line = self._readline(limit)
            if self._in_headers:
                line_without_end = line.rstrip("\r\n")  # CRLF, CR, or LF -- "universal newlines"
                if not line_without_end:
                    # RFC5322 section 2.1: "The body ... is separated from the header section
                    # by an empty line (i.e., a line with nothing preceding the CRLF)."
                    # (An empty string, i.e. end of input, also ends the header section.)
                    self._in_headers = False
                else:
                    # Is this header line folded? Need to check next line...
                    # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF
                    # that is immediately followed by WSP." (WSP is space or tab)
                    next_line = self._peekline(limit)
                    if next_line.startswith((' ', '\t')):
                        line = line_without_end
            return line

        def read(self, size=-1):
            """
            Return the next chunk of data.

            `size` is only a hint: in the header section a single (unfolded) line is
            returned regardless of it, and any pushed-back lines are returned whole.
            It defaults to -1 so that a bare `read()` works as it does on file objects.
            """
            if self._in_headers:
                # For simplicity, just read a line at a time while in the header section.
                # (This works because we know email.parser.Parser doesn't really care if it reads
                # more or less data than it asked for -- it just pushes it into FeedParser either way.)
                return self.readline(size)
            if self._pushback:
                # Flush lines that were read ahead before the header section ended
                buf = ''.join(self._pushback)
                self._pushback.clear()
                return buf
            return self.fp.read(size)
    def _decode_rfc2047(value):
        result = value
        decoded_segments = decode_header(value)
        if any(charset is not None for raw, charset in decoded_segments):
            # At least one segment is an RFC2047 encoded-word.
            # Reassemble the segments into a single decoded string.
            unicode_segments = []
            prev_charset = None
            for raw, charset in decoded_segments:
                if (charset is None or prev_charset is None) and unicode_segments:
                    # Transitioning to, from, or between *non*-encoded segments:
                    # add back inter-segment whitespace that decode_header consumed
                    unicode_segments.append(u" ")
                decoded = raw.decode(charset, 'replace') if charset is not None else raw
                unicode_segments.append(decoded)
                prev_charset = charset
            result = u"".join(unicode_segments)
        return result
 class AnymailInboundMessage(Message, object):  # `object` ensures new-style class in Python 2)
    """