Internal: move lengthy compatibility EmailParser into own file

This commit is contained in:
medmunds
2018-04-01 15:24:18 -07:00
parent dbe48d48af
commit 008aef237e
2 changed files with 140 additions and 125 deletions

139
anymail/_email_compat.py Normal file
View File

@@ -0,0 +1,139 @@
# Work around bugs in older versions of email.parser.Parser
#
# This module implements two classes:
# EmailParser
# EmailBytesParser
# which can be used like the Python 3.3+ email.parser.Parser
# and email.parser.BytesParser (with email.policy.default).
#
# On Python 2.7, they attempt to work around some bugs/limitations
# in email.parser.Parser, without trying to back-port the whole
# Python 3 email package.
__all__ = ['EmailParser', 'EmailBytesParser']
from email.parser import Parser
try:
# With Python 3.3+ (email6) package, using `policy=email.policy.default`
# avoids earlier bugs. (Note that Parser defaults to policy=compat32,
# which *preserves* earlier bugs.)
from email.policy import default
from email.parser import BytesParser
class EmailParser(Parser):
    """
    email.parser.Parser that parses with `email.policy.default` unless
    the caller supplies a different policy.

    (The stock Parser defaults to policy=compat32, which preserves
    the bugs of the earlier email package.)
    """

    def __init__(self, _class=None, policy=default):
        # Forward both arguments unchanged; only the policy default differs
        # from the base class.
        super(EmailParser, self).__init__(_class=_class, policy=policy)
class EmailBytesParser(BytesParser):
    """
    email.parser.BytesParser that parses with `email.policy.default` unless
    the caller supplies a different policy.

    (The stock parser defaults to policy=compat32, which preserves
    the bugs of the earlier email package.)
    """

    def __init__(self, _class=None, policy=default):
        # Forward both arguments unchanged; only the policy default differs
        # from the base class.
        super(EmailBytesParser, self).__init__(_class=_class, policy=policy)
except ImportError:
# Pre-Python 3.3 email package: try to work around some bugs
from email.header import decode_header
from collections import deque
class EmailParser(Parser):
    """
    Parser for the pre-Python 3.3 email package that compensates for two
    of its shortcomings: folded headers and RFC 2047 encoded-words.
    """

    def parse(self, fp, headersonly=False):
        """
        Parse a message from the file-like object `fp` and return it, with
        root headers unfolded and encoded-word headers decoded.
        """
        # Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3),
        # so hand it a stream whose headers have already been unfolded.
        unfolding_fp = HeaderUnfoldingWrapper(fp)
        parsed = Parser.parse(self, unfolding_fp, headersonly=headersonly)

        # Older Parser doesn't decode RFC2047 headers either. The message is
        # fully parsed at this point, so every MIME subpart can be fixed up.
        for subpart in parsed.walk():
            fixed_headers = []
            for header_name, header_value in subpart._headers:
                fixed_headers.append((header_name, _decode_rfc2047(header_value)))
            # There doesn't seem to be a public API to replace all headers at once
            subpart._headers = fixed_headers
        return parsed
class EmailBytesParser(EmailParser):
    """Bytes-accepting counterpart of EmailParser for the pre-Python 3.3 email package."""

    def parsebytes(self, text, headersonly=False):
        """Parse the message held in `text` (bytes) and return it."""
        # In Python 2, bytes is str, and Parser.parsestr reads it through the
        # bytes-friendly cStringIO.StringIO -- so parsestr can do all the work.
        return self.parsestr(text, headersonly=headersonly)
class HeaderUnfoldingWrapper:
    """
    File-like adapter handed to email.parser.Parser.parse in place of the real
    file object. It pre-unfolds folded headers so that older Parser versions,
    which mishandle them, never see a fold.

    Only the header section at the message root is treated; headers inside a
    MIME subpart pass through untouched. (Recognizing subpart headers reliably
    would require parsing mixed-content boundaries.)
    """

    def __init__(self, fp):
        self.fp = fp
        # True until the blank separator line (or end of input) has been read
        self._in_headers = True
        # Lines already pulled from fp but not yet returned to the caller
        self._pushback = deque()

    def _readline(self, limit=-1):
        """Return the next physical line: queued lines first, then fp."""
        if self._pushback:
            return self._pushback.popleft()

        chunk = self.fp.readline(limit)
        # cStringIO.readline doesn't recognize universal newlines; splitlines does
        pieces = chunk.splitlines(True)
        if len(pieces) <= 1:
            return chunk
        # fp returned several lines at once: keep the first, queue the rest
        self._pushback.extend(pieces[1:])
        return pieces[0]

    def _peekline(self, limit=-1):
        """Return the next physical line without consuming it."""
        if not self._pushback:
            self._pushback.appendleft(self._readline(limit))
        return self._pushback[0]

    def readline(self, limit=-1):
        """
        Return the next line. While in the header section, a line whose
        successor starts with a space or tab is returned without its line ending.
        """
        line = self._readline(limit)
        if not self._in_headers:
            return line

        bare_line = line.rstrip("\r\n")  # CRLF, CR, or LF -- "universal newlines"
        if not bare_line:
            # RFC5322 section 2.1: "The body ... is separated from the header section
            # by an empty line (i.e., a line with nothing preceding the CRLF)."
            self._in_headers = False
            return line

        # RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF
        # that is immediately followed by WSP." (WSP is space or tab)
        # So whether this line is folded depends on how the *next* line begins.
        if self._peekline(limit).startswith((' ', '\t')):
            return bare_line
        return line

    def read(self, size):
        """Return the next chunk of data; it may be shorter or longer than `size`."""
        if self._in_headers:
            # For simplicity, just read a line at a time while in the header section.
            # (This works because we know email.parser.Parser doesn't really care if it
            # reads more or less data than it asked for -- it just pushes it into
            # FeedParser either way.)
            return self.readline(size)

        if not self._pushback:
            return self.fp.read(size)

        # Drain whatever was queued while peeking before going back to fp
        pending = ''.join(self._pushback)
        self._pushback.clear()
        return pending
def _decode_rfc2047(value):
result = value
decoded_segments = decode_header(value)
if any(charset is not None for raw, charset in decoded_segments):
# At least one segment is an RFC2047 encoded-word.
# Reassemble the segments into a single decoded string.
unicode_segments = []
prev_charset = None
for raw, charset in decoded_segments:
if (charset is None or prev_charset is None) and unicode_segments:
# Transitioning to, from, or between *non*-encoded segments:
# add back inter-segment whitespace that decode_header consumed
unicode_segments.append(u" ")
decoded = raw.decode(charset, 'replace') if charset is not None else raw
unicode_segments.append(decoded)
prev_charset = charset
result = u"".join(unicode_segments)
return result

View File

@@ -1,137 +1,13 @@
from base64 import b64decode from base64 import b64decode
from email.message import Message from email.message import Message
from email.parser import Parser
from email.utils import unquote from email.utils import unquote
import six import six
from django.core.files.uploadedfile import SimpleUploadedFile from django.core.files.uploadedfile import SimpleUploadedFile
from ._email_compat import EmailParser, EmailBytesParser
from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date from .utils import angle_wrap, get_content_disposition, parse_address_list, parse_rfc2822date
# Work around bugs in older versions of email.parser.Parser
try:
# With Python 3.3+ (email6) package, using `policy=email.policy.default`
# avoids earlier bugs. (Note that Parser defaults to policy=compat32,
# which *preserves* earlier bugs.)
from email.policy import default
from email.parser import BytesParser
class EmailParser(Parser):
def __init__(self, _class=None, policy=default): # don't default to compat32 policy
super(EmailParser, self).__init__(_class, policy=policy)
class EmailBytesParser(BytesParser):
def __init__(self, _class=None, policy=default): # don't default to compat32 policy
super(EmailBytesParser, self).__init__(_class, policy=policy)
except ImportError:
# Pre-Python 3.3 email package: try to work around some bugs
from email.header import decode_header
from collections import deque
class EmailParser(Parser):
def parse(self, fp, headersonly=False):
# Older Parser doesn't correctly unfold headers (RFC5322 section 2.2.3).
# Help it out by pre-unfolding the headers for it.
fp = HeaderUnfoldingWrapper(fp)
message = Parser.parse(self, fp, headersonly=headersonly)
# Older Parser doesn't decode RFC2047 headers, so fix them up here.
# (Since messsage is fully parsed, can decode headers in all MIME subparts.)
for part in message.walk():
part._headers = [ # doesn't seem to be a public API to easily replace all headers
(name, _decode_rfc2047(value))
for name, value in part._headers]
return message
class EmailBytesParser(EmailParser):
def parsebytes(self, text, headersonly=False):
# In Python 2, bytes is str, and Parser.parsestr uses bytes-friendly cStringIO.StringIO.
return self.parsestr(text, headersonly)
class HeaderUnfoldingWrapper:
"""
A wrapper for file-like objects passed to email.parser.Parser.parse which works
around older Parser bugs with folded email headers by pre-unfolding them.
This only works for headers at the message root, not ones within a MIME subpart.
(Accurately recognizing subpart headers would require parsing mixed-content boundaries.)
"""
def __init__(self, fp):
self.fp = fp
self._in_headers = True
self._pushback = deque()
def _readline(self, limit=-1):
try:
line = self._pushback.popleft()
except IndexError:
line = self.fp.readline(limit)
# cStringIO.readline doesn't recognize universal newlines; splitlines does
lines = line.splitlines(True)
if len(lines) > 1:
line = lines[0]
self._pushback.extend(lines[1:])
return line
def _peekline(self, limit=-1):
try:
line = self._pushback[0]
except IndexError:
line = self._readline(limit)
self._pushback.appendleft(line)
return line
def readline(self, limit=-1):
line = self._readline(limit)
if self._in_headers:
line_without_end = line.rstrip("\r\n") # CRLF, CR, or LF -- "universal newlines"
if len(line_without_end) == 0:
# RFC5322 section 2.1: "The body ... is separated from the header section
# by an empty line (i.e., a line with nothing preceding the CRLF)."
self._in_headers = False
else:
# Is this header line folded? Need to check next line...
# RFC5322 section 2.2.3: "Unfolding is accomplished by simply removing any CRLF
# that is immediately followed by WSP." (WSP is space or tab)
next_line = self._peekline(limit)
if next_line.startswith((' ', '\t')):
line = line_without_end
return line
def read(self, size):
if self._in_headers:
# For simplicity, just read a line at a time while in the header section.
# (This works because we know email.parser.Parser doesn't really care if it reads
# more or less data than it asked for -- it just pushes it into FeedParser either way.)
return self.readline(size)
elif len(self._pushback):
buf = ''.join(self._pushback)
self._pushback.clear()
return buf
else:
return self.fp.read(size)
def _decode_rfc2047(value):
result = value
decoded_segments = decode_header(value)
if any(charset is not None for raw, charset in decoded_segments):
# At least one segment is an RFC2047 encoded-word.
# Reassemble the segments into a single decoded string.
unicode_segments = []
prev_charset = None
for raw, charset in decoded_segments:
if (charset is None or prev_charset is None) and unicode_segments:
# Transitioning to, from, or between *non*-encoded segments:
# add back inter-segment whitespace that decode_header consumed
unicode_segments.append(u" ")
decoded = raw.decode(charset, 'replace') if charset is not None else raw
unicode_segments.append(decoded)
prev_charset = charset
result = u"".join(unicode_segments)
return result
class AnymailInboundMessage(Message, object): # `object` ensures new-style class in Python 2) class AnymailInboundMessage(Message, object): # `object` ensures new-style class in Python 2)
""" """