From 008aef237e12f69203f0c2d14d342826a4090348 Mon Sep 17 00:00:00 2001 From: medmunds Date: Sun, 1 Apr 2018 15:24:18 -0700 Subject: [PATCH] Internal: move lengthy compatibility EmailParser into own file --- anymail/_email_compat.py | 139 +++++++++++++++++++++++++++++++++++++++ anymail/inbound.py | 126 +---------------------------------- 2 files changed, 140 insertions(+), 125 deletions(-) create mode 100644 anymail/_email_compat.py diff --git a/anymail/_email_compat.py b/anymail/_email_compat.py new file mode 100644 index 0000000..a760a38 --- /dev/null +++ b/anymail/_email_compat.py @@ -0,0 +1,139 @@ +# Work around bugs in older versions of email.parser.Parser +# +# This module implements two classes: +# EmailParser +# EmailBytesParser +# which can be used like the Python 3.3+ email.parser.Parser +# and email.parser.BytesParser (with email.policy.default). +# +# On Python 2.7, they attempt to work around some bugs/limitations +# in email.parser.Parser, without trying to back-port the whole +# Python 3 email package. + +__all__ = ['EmailParser', 'EmailBytesParser'] + + +from email.parser import Parser + +try: + # With Python 3.3+ (email6) package, using `policy=email.policy.default` + # avoids earlier bugs. (Note that Parser defaults to policy=compat32, + # which *preserves* earlier bugs.) 
+ from email.policy import default + from email.parser import BytesParser + + class EmailParser(Parser): + def __init__(self, _class=None, policy=default): # don't default to compat32 policy + super(EmailParser, self).__init__(_class, policy=policy) + + class EmailBytesParser(BytesParser): + def __init__(self, _class=None, policy=default): # don't default to compat32 policy + super(EmailBytesParser, self).__init__(_class, policy=policy) + +except ImportError: + # Pre-Python 3.3 email package: try to work around some bugs + from email.header import decode_header + from collections import deque + + class EmailParser(Parser): + def parse(self, fp, headersonly=False): + # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3). + # Help it out by pre-unfolding the headers for it. + fp = HeaderUnfoldingWrapper(fp) + message = Parser.parse(self, fp, headersonly=headersonly) + + # Older Parser doesn't decode RFC2047 headers, so fix them up here. + # (Since message is fully parsed, can decode headers in all MIME subparts.) + for part in message.walk(): + part._headers = [ # doesn't seem to be a public API to easily replace all headers + (name, _decode_rfc2047(value)) + for name, value in part._headers] + return message + + class EmailBytesParser(EmailParser): + def parsebytes(self, text, headersonly=False): + # In Python 2, bytes is str, and Parser.parsestr uses bytes-friendly cStringIO.StringIO. + return self.parsestr(text, headersonly) + + class HeaderUnfoldingWrapper: + """ + A wrapper for file-like objects passed to email.parser.Parser.parse which works + around older Parser bugs with folded email headers by pre-unfolding them. + + This only works for headers at the message root, not ones within a MIME subpart. + (Accurately recognizing subpart headers would require parsing mixed-content boundaries.) 
+ """ + + def __init__(self, fp): + self.fp = fp + self._in_headers = True + self._pushback = deque() + + def _readline(self, limit=-1): + try: + line = self._pushback.popleft() + except IndexError: + line = self.fp.readline(limit) + # cStringIO.readline doesn't recognize universal newlines; splitlines does + lines = line.splitlines(True) + if len(lines) > 1: + line = lines[0] + self._pushback.extend(lines[1:]) + return line + + def _peekline(self, limit=-1): + try: + line = self._pushback[0] + except IndexError: + line = self._readline(limit) + self._pushback.appendleft(line) + return line + + def readline(self, limit=-1): + line = self._readline(limit) + if self._in_headers: + line_without_end = line.rstrip("\r\n") # CRLF, CR, or LF -- "universal newlines" + if len(line_without_end) == 0: + # RFC5322 section 2.1: "The body ... is separated from the header section + # by an empty line (i.e., a line with nothing preceding the CRLF)." + self._in_headers = False + else: + # Is this header line folded? Need to check next line... + # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF + # that is immediately followed by WSP." (WSP is space or tab) + next_line = self._peekline(limit) + if next_line.startswith((' ', '\t')): + line = line_without_end + return line + + def read(self, size): + if self._in_headers: + # For simplicity, just read a line at a time while in the header section. + # (This works because we know email.parser.Parser doesn't really care if it reads + # more or less data than it asked for -- it just pushes it into FeedParser either way.) + return self.readline(size) + elif len(self._pushback): + buf = ''.join(self._pushback) + self._pushback.clear() + return buf + else: + return self.fp.read(size) + + def _decode_rfc2047(value): + result = value + decoded_segments = decode_header(value) + if any(charset is not None for raw, charset in decoded_segments): + # At least one segment is an RFC2047 encoded-word. 
+ # Reassemble the segments into a single decoded string. + unicode_segments = [] + prev_charset = None + for raw, charset in decoded_segments: + if (charset is None or prev_charset is None) and unicode_segments: + # Transitioning to, from, or between *non*-encoded segments: + # add back inter-segment whitespace that decode_header consumed + unicode_segments.append(u" ") + decoded = raw.decode(charset, 'replace') if charset is not None else raw + unicode_segments.append(decoded) + prev_charset = charset + result = u"".join(unicode_segments) + return result diff --git a/anymail/inbound.py b/anymail/inbound.py index 5da235f..ec65215 100644 --- a/anymail/inbound.py +++ b/anymail/inbound.py @@ -1,137 +1,13 @@ from base64 import b64decode from email.message import Message -from email.parser import Parser from email.utils import unquote import six from django.core.files.uploadedfile import SimpleUploadedFile +from ._email_compat import EmailParser, EmailBytesParser from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date -# Work around bugs in older versions of email.parser.Parser -try: - # With Python 3.3+ (email6) package, using `policy=email.policy.default` - # avoids earlier bugs. (Note that Parser defaults to policy=compat32, - # which *preserves* earlier bugs.) 
- from email.policy import default - from email.parser import BytesParser - - class EmailParser(Parser): - def __init__(self, _class=None, policy=default): # don't default to compat32 policy - super(EmailParser, self).__init__(_class, policy=policy) - - class EmailBytesParser(BytesParser): - def __init__(self, _class=None, policy=default): # don't default to compat32 policy - super(EmailBytesParser, self).__init__(_class, policy=policy) - -except ImportError: - # Pre-Python 3.3 email package: try to work around some bugs - from email.header import decode_header - from collections import deque - - class EmailParser(Parser): - def parse(self, fp, headersonly=False): - # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3). - # Help it out by pre-unfolding the headers for it. - fp = HeaderUnfoldingWrapper(fp) - message = Parser.parse(self, fp, headersonly=headersonly) - - # Older Parser doesn't decode RFC2047 headers, so fix them up here. - # (Since messsage is fully parsed, can decode headers in all MIME subparts.) - for part in message.walk(): - part._headers = [ # doesn't seem to be a public API to easily replace all headers - (name, _decode_rfc2047(value)) - for name, value in part._headers] - return message - - class EmailBytesParser(EmailParser): - def parsebytes(self, text, headersonly=False): - # In Python 2, bytes is str, and Parser.parsestr uses bytes-friendly cStringIO.StringIO. - return self.parsestr(text, headersonly) - - class HeaderUnfoldingWrapper: - """ - A wrapper for file-like objects passed to email.parser.Parser.parse which works - around older Parser bugs with folded email headers by pre-unfolding them. - - This only works for headers at the message root, not ones within a MIME subpart. - (Accurately recognizing subpart headers would require parsing mixed-content boundaries.) 
- """ - - def __init__(self, fp): - self.fp = fp - self._in_headers = True - self._pushback = deque() - - def _readline(self, limit=-1): - try: - line = self._pushback.popleft() - except IndexError: - line = self.fp.readline(limit) - # cStringIO.readline doesn't recognize universal newlines; splitlines does - lines = line.splitlines(True) - if len(lines) > 1: - line = lines[0] - self._pushback.extend(lines[1:]) - return line - - def _peekline(self, limit=-1): - try: - line = self._pushback[0] - except IndexError: - line = self._readline(limit) - self._pushback.appendleft(line) - return line - - def readline(self, limit=-1): - line = self._readline(limit) - if self._in_headers: - line_without_end = line.rstrip("\r\n") # CRLF, CR, or LF -- "universal newlines" - if len(line_without_end) == 0: - # RFC5322 section 2.1: "The body ... is separated from the header section - # by an empty line (i.e., a line with nothing preceding the CRLF)." - self._in_headers = False - else: - # Is this header line folded? Need to check next line... - # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF - # that is immediately followed by WSP." (WSP is space or tab) - next_line = self._peekline(limit) - if next_line.startswith((' ', '\t')): - line = line_without_end - return line - - def read(self, size): - if self._in_headers: - # For simplicity, just read a line at a time while in the header section. - # (This works because we know email.parser.Parser doesn't really care if it reads - # more or less data than it asked for -- it just pushes it into FeedParser either way.) - return self.readline(size) - elif len(self._pushback): - buf = ''.join(self._pushback) - self._pushback.clear() - return buf - else: - return self.fp.read(size) - - def _decode_rfc2047(value): - result = value - decoded_segments = decode_header(value) - if any(charset is not None for raw, charset in decoded_segments): - # At least one segment is an RFC2047 encoded-word. 
- # Reassemble the segments into a single decoded string. - unicode_segments = [] - prev_charset = None - for raw, charset in decoded_segments: - if (charset is None or prev_charset is None) and unicode_segments: - # Transitioning to, from, or between *non*-encoded segments: - # add back inter-segment whitespace that decode_header consumed - unicode_segments.append(u" ") - decoded = raw.decode(charset, 'replace') if charset is not None else raw - unicode_segments.append(decoded) - prev_charset = charset - result = u"".join(unicode_segments) - return result - class AnymailInboundMessage(Message, object): # `object` ensures new-style class in Python 2) """