From 008aef237e12f69203f0c2d14d342826a4090348 Mon Sep 17 00:00:00 2001
From: medmunds <medmunds@gmail.com>
Date: Sun, 1 Apr 2018 15:24:18 -0700
Subject: [PATCH] Internal: move lengthy compatibility EmailParser into own
 file

---
 anymail/_email_compat.py | 139 +++++++++++++++++++++++++++++++++++++++
 anymail/inbound.py       | 126 +----------------------------------
 2 files changed, 140 insertions(+), 125 deletions(-)
 create mode 100644 anymail/_email_compat.py

diff --git a/anymail/_email_compat.py b/anymail/_email_compat.py
new file mode 100644
index 0000000..a760a38
--- /dev/null
+++ b/anymail/_email_compat.py
@@ -0,0 +1,139 @@
+# Work around bugs in older versions of email.parser.Parser
+#
+# This module implements two classes:
+#   EmailParser
+#   EmailBytesParser
+# which can be used like the Python 3.3+ email.parser.Parser
+# and email.parser.BytesParser (with email.policy.default).
+#
+# On Python 2.7, they attempt to work around some bugs/limitations
+# in email.parser.Parser, without trying to back-port the whole
+# Python 3 email package.
+
+__all__ = ['EmailParser', 'EmailBytesParser']
+
+
+from email.parser import Parser
+
+try:
+    # With Python 3.3+ (email6) package, using `policy=email.policy.default`
+    # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
+    # which *preserves* earlier bugs.)
+    from email.policy import default
+    from email.parser import BytesParser
+
+    class EmailParser(Parser):
+        def __init__(self, _class=None, policy=default):  # don't default to compat32 policy
+            super(EmailParser, self).__init__(_class, policy=policy)
+
+    class EmailBytesParser(BytesParser):
+        def __init__(self, _class=None, policy=default):  # don't default to compat32 policy
+            super(EmailBytesParser, self).__init__(_class, policy=policy)
+
+except ImportError:
+    # Pre-Python 3.3 email package: try to work around some bugs
+    from email.header import decode_header
+    from collections import deque
+
+    class EmailParser(Parser):
+        def parse(self, fp, headersonly=False):
+            # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3).
+            # Help it out by pre-unfolding the headers for it.
+            fp = HeaderUnfoldingWrapper(fp)
+            message = Parser.parse(self, fp, headersonly=headersonly)
+
+            # Older Parser doesn't decode RFC2047 headers, so fix them up here.
+            # (Since message is fully parsed, can decode headers in all MIME subparts.)
+            for part in message.walk():
+                part._headers = [  # doesn't seem to be a public API to easily replace all headers
+                    (name, _decode_rfc2047(value))
+                    for name, value in part._headers]
+            return message
+
+    class EmailBytesParser(EmailParser):
+        def parsebytes(self, text, headersonly=False):
+            # In Python 2, bytes is str, and Parser.parsestr uses bytes-friendly cStringIO.StringIO.
+            return self.parsestr(text, headersonly)
+
+    class HeaderUnfoldingWrapper:
+        """
+        A wrapper for file-like objects passed to email.parser.Parser.parse which works
+        around older Parser bugs with folded email headers by pre-unfolding them.
+
+        This only works for headers at the message root, not ones within a MIME subpart.
+        (Accurately recognizing subpart headers would require parsing mixed-content boundaries.)
+        """
+
+        def __init__(self, fp):
+            self.fp = fp
+            self._in_headers = True
+            self._pushback = deque()
+
+        def _readline(self, limit=-1):
+            try:
+                line = self._pushback.popleft()
+            except IndexError:
+                line = self.fp.readline(limit)
+                # cStringIO.readline doesn't recognize universal newlines; splitlines does
+                lines = line.splitlines(True)
+                if len(lines) > 1:
+                    line = lines[0]
+                    self._pushback.extend(lines[1:])
+            return line
+
+        def _peekline(self, limit=-1):
+            try:
+                line = self._pushback[0]
+            except IndexError:
+                line = self._readline(limit)
+                self._pushback.appendleft(line)
+            return line
+
+        def readline(self, limit=-1):
+            line = self._readline(limit)
+            if self._in_headers:
+                line_without_end = line.rstrip("\r\n")  # CRLF, CR, or LF -- "universal newlines"
+                if len(line_without_end) == 0:
+                    # RFC5322 section 2.1: "The body ... is separated from the header section
+                    # by an empty line (i.e., a line with nothing preceding the CRLF)."
+                    self._in_headers = False
+                else:
+                    # Is this header line folded? Need to check next line...
+                    # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF
+                    # that is immediately followed by WSP." (WSP is space or tab)
+                    next_line = self._peekline(limit)
+                    if next_line.startswith((' ', '\t')):
+                        line = line_without_end
+            return line
+
+        def read(self, size):
+            if self._in_headers:
+                # For simplicity, just read a line at a time while in the header section.
+                # (This works because we know email.parser.Parser doesn't really care if it reads
+                # more or less data than it asked for -- it just pushes it into FeedParser either way.)
+                return self.readline(size)
+            elif len(self._pushback):
+                buf = ''.join(self._pushback)
+                self._pushback.clear()
+                return buf
+            else:
+                return self.fp.read(size)
+
+    def _decode_rfc2047(value):
+        result = value
+        decoded_segments = decode_header(value)
+        if any(charset is not None for raw, charset in decoded_segments):
+            # At least one segment is an RFC2047 encoded-word.
+            # Reassemble the segments into a single decoded string.
+            unicode_segments = []
+            prev_charset = None
+            for raw, charset in decoded_segments:
+                if (charset is None or prev_charset is None) and unicode_segments:
+                    # Transitioning to, from, or between *non*-encoded segments:
+                    # add back inter-segment whitespace that decode_header consumed
+                    unicode_segments.append(u" ")
+                decoded = raw.decode(charset, 'replace') if charset is not None else raw
+                unicode_segments.append(decoded)
+                prev_charset = charset
+            result = u"".join(unicode_segments)
+        return result
diff --git a/anymail/inbound.py b/anymail/inbound.py
index 5da235f..ec65215 100644
--- a/anymail/inbound.py
+++ b/anymail/inbound.py
@@ -1,137 +1,13 @@
 from base64 import b64decode
 from email.message import Message
-from email.parser import Parser
 from email.utils import unquote
 
 import six
 from django.core.files.uploadedfile import SimpleUploadedFile
 
+from ._email_compat import EmailParser, EmailBytesParser
 from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date
 
-# Work around bugs in older versions of email.parser.Parser
-try:
-    # With Python 3.3+ (email6) package, using `policy=email.policy.default`
-    # avoids earlier bugs. (Note that Parser defaults to policy=compat32,
-    # which *preserves* earlier bugs.)
-    from email.policy import default
-    from email.parser import BytesParser
-
-    class EmailParser(Parser):
-        def __init__(self, _class=None, policy=default):  # don't default to compat32 policy
-            super(EmailParser, self).__init__(_class, policy=policy)
-
-    class EmailBytesParser(BytesParser):
-        def __init__(self, _class=None, policy=default):  # don't default to compat32 policy
-            super(EmailBytesParser, self).__init__(_class, policy=policy)
-
-except ImportError:
-    # Pre-Python 3.3 email package: try to work around some bugs
-    from email.header import decode_header
-    from collections import deque
-
-    class EmailParser(Parser):
-        def parse(self, fp, headersonly=False):
-            # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3).
-            # Help it out by pre-unfolding the headers for it.
-            fp = HeaderUnfoldingWrapper(fp)
-            message = Parser.parse(self, fp, headersonly=headersonly)
-
-            # Older Parser doesn't decode RFC2047 headers, so fix them up here.
-            # (Since messsage is fully parsed, can decode headers in all MIME subparts.)
-            for part in message.walk():
-                part._headers = [  # doesn't seem to be a public API to easily replace all headers
-                    (name, _decode_rfc2047(value))
-                    for name, value in part._headers]
-            return message
-
-    class EmailBytesParser(EmailParser):
-        def parsebytes(self, text, headersonly=False):
-            # In Python 2, bytes is str, and Parser.parsestr uses bytes-friendly cStringIO.StringIO.
-            return self.parsestr(text, headersonly)
-
-    class HeaderUnfoldingWrapper:
-        """
-        A wrapper for file-like objects passed to email.parser.Parser.parse which works
-        around older Parser bugs with folded email headers by pre-unfolding them.
-
-        This only works for headers at the message root, not ones within a MIME subpart.
-        (Accurately recognizing subpart headers would require parsing mixed-content boundaries.)
-        """
-
-        def __init__(self, fp):
-            self.fp = fp
-            self._in_headers = True
-            self._pushback = deque()
-
-        def _readline(self, limit=-1):
-            try:
-                line = self._pushback.popleft()
-            except IndexError:
-                line = self.fp.readline(limit)
-                # cStringIO.readline doesn't recognize universal newlines; splitlines does
-                lines = line.splitlines(True)
-                if len(lines) > 1:
-                    line = lines[0]
-                    self._pushback.extend(lines[1:])
-            return line
-
-        def _peekline(self, limit=-1):
-            try:
-                line = self._pushback[0]
-            except IndexError:
-                line = self._readline(limit)
-                self._pushback.appendleft(line)
-            return line
-
-        def readline(self, limit=-1):
-            line = self._readline(limit)
-            if self._in_headers:
-                line_without_end = line.rstrip("\r\n")  # CRLF, CR, or LF -- "universal newlines"
-                if len(line_without_end) == 0:
-                    # RFC5322 section 2.1: "The body ... is separated from the header section
-                    # by an empty line (i.e., a line with nothing preceding the CRLF)."
-                    self._in_headers = False
-                else:
-                    # Is this header line folded? Need to check next line...
-                    # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF
-                    # that is immediately followed by WSP." (WSP is space or tab)
-                    next_line = self._peekline(limit)
-                    if next_line.startswith((' ', '\t')):
-                        line = line_without_end
-            return line
-
-        def read(self, size):
-            if self._in_headers:
-                # For simplicity, just read a line at a time while in the header section.
-                # (This works because we know email.parser.Parser doesn't really care if it reads
-                # more or less data than it asked for -- it just pushes it into FeedParser either way.)
-                return self.readline(size)
-            elif len(self._pushback):
-                buf = ''.join(self._pushback)
-                self._pushback.clear()
-                return buf
-            else:
-                return self.fp.read(size)
-
-    def _decode_rfc2047(value):
-        result = value
-        decoded_segments = decode_header(value)
-        if any(charset is not None for raw, charset in decoded_segments):
-            # At least one segment is an RFC2047 encoded-word.
-            # Reassemble the segments into a single decoded string.
-            unicode_segments = []
-            prev_charset = None
-            for raw, charset in decoded_segments:
-                if (charset is None or prev_charset is None) and unicode_segments:
-                    # Transitioning to, from, or between *non*-encoded segments:
-                    # add back inter-segment whitespace that decode_header consumed
-                    unicode_segments.append(u" ")
-                decoded = raw.decode(charset, 'replace') if charset is not None else raw
-                unicode_segments.append(decoded)
-                prev_charset = charset
-            result = u"".join(unicode_segments)
-        return result
-
 
 class AnymailInboundMessage(Message, object):  # `object` ensures new-style class in Python 2)
     """