Skip to content

Commit

Permalink
Add stream-oriented version of convert_to_utf8().
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Jan 24, 2022
1 parent 3a806ae commit 80b31af
Show file tree
Hide file tree
Showing 2 changed files with 297 additions and 0 deletions.
211 changes: 211 additions & 0 deletions feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import cgi
import codecs
import io
import re

try:
Expand All @@ -46,6 +47,7 @@ def lazy_chardet_encoding(data):
CharacterEncodingUnknown,
NonXMLContentType,
)
from .sanitizer import replace_doctype


# Each marker represents some of the characters of the opening XML
Expand Down Expand Up @@ -291,3 +293,212 @@ def convert_to_utf8(http_headers, data, result):
result['bozo'] = True
result['bozo_exception'] = error
return data


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
    """Like convert_to_utf8(), but for a binary stream.

    Unlike convert_to_utf8(), do not read the entire file in memory;
    instead, return a text stream that decodes it on the fly.
    This should consume significantly less memory,
    because it avoids (repeatedly) converting the entire file contents
    from bytes to str and back.

    To detect the encoding, only a prefix of the file contents is used.
    In rare cases, the wrong encoding may be detected for this prefix;
    use optimistic_encoding_detection=False to use the entire file contents
    (equivalent to a plain convert_to_utf8() call).

    In addition to convert_to_utf8(),
    call .sanitizer.replace_doctype() on the contents of the stream,
    and set result['version'] to the returned version.

    Args:
        http_headers (dict): The response headers.
        file (typing.IO[bytes]): A read()-able binary stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        tuple(StreamFactory, dict):
            A pair of:

            * a stream factory, with the detected encoding set, if any
            * the safe_entities dict returned by replace_doctype()

    """
    # Currently, this wraps convert_to_utf8(), because the logic is simply
    # too complicated to ensure it's re-implemented correctly for a stream.
    # That said, it should be possible to change the implementation
    # transparently (not sure it's worth it, though).
    #
    # We are forced to call replace_doctype() here, because the returned
    # data may be shorter than the input, and we need to be able to
    # stitch the stream back correctly. To have a single file wrapper,
    # all the changes must happen in one place.
    # Doing it here keeps the code cleaner overall.

    if optimistic_encoding_detection:
        # Detect encoding (and fill result) from a bounded prefix only;
        # the converted prefix is then stitched back onto the rest of the file.
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        result['version'], prefix, entities = replace_doctype(prefix)
        file = PrefixFileWrapper(prefix, file)

    else:
        # this shouldn't increase memory usage if file is BytesIO,
        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)
        result['version'], data, entities = replace_doctype(data)
        # still need to be able to reset() to the "beginning"
        file = PrefixFileWrapper(b'', io.BytesIO(data))

    return StreamFactory(file, result.get('encoding')), entities


# Default number of bytes used to detect a stream's encoding.
# In initial tests, 4k was enough for ~160 mostly-English feeds;
# 64k seems like a safe margin.
CONVERT_FILE_PREFIX_LEN = 2 ** 16

def convert_file_prefix_to_utf8(http_headers, file, result, prefix_len=CONVERT_FILE_PREFIX_LEN):
    """Like convert_to_utf8(), but only use the prefix of a binary file.

    Set result like convert_to_utf8() would.

    Return the updated prefix, as bytes.
    """
    prefix = file.read(prefix_len)

    # Try the conversion up to 4 times, growing the prefix one byte per
    # attempt, so that a multi-byte code point cut at the prefix boundary
    # eventually ends on a character boundary.
    for _ in range(4):
        candidate_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, candidate_result)

        # success: the prefix decoded cleanly
        if not candidate_result.get('bozo'):
            break

        # a short read means the prefix is actually the entire file;
        # there is nothing more we can do about the error
        if len(prefix) < prefix_len:
            break

        extra_byte = file.read(1)
        if not extra_byte:
            # end of file reached; give up with the last attempt's result
            break

        prefix += extra_byte
        prefix_len += 1

    result.update(candidate_result)
    return converted_prefix


class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    If the underlying file is seekable, it is possible to read()
    the same content again by calling reset().

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'
    >>> wrapped.reset()
    >>> wrapped.read()
    'CDef'
    >>>
    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file

        try:
            self.file_initial_offset = file.tell()
        except OSError:
            # Not seekable; reset() will fail when seek() is attempted.
            self.file_initial_offset = None

        # How many bytes/chars of the combined (prefix + file) stream
        # have been consumed so far.
        self.offset = 0

    def reset(self):
        """Rewind to the start of the combined stream.

        Raises io.UnsupportedOperation if the underlying stream
        is not seekable.
        """
        self.file.seek(self.file_initial_offset)
        self.offset = 0

    def read(self, size=-1):
        """Read up to *size* bytes/chars; read everything if size < 0."""
        # A zero-length read yields an empty object of the stream's type
        # (b'' or ''), so concatenation below works for both.
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # BUG FIX: serve only the *unread* part of the prefix.
                # Previously this was `chunk = self.prefix`, which returned
                # already-consumed prefix data again after a partial read.
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # size < 0 means "read all": one read() drains the file.
            # size == 0 means the request is satisfied.
            if size <= 0:
                break

            size -= len(chunk)

        return buffer

    def close(self):
        # do not touch the underlying stream; the caller owns it
        pass


class MissingEncoding(io.UnsupportedOperation):
    """Raised by StreamFactory.get_text_file() when no encoding is available."""
    pass


class StreamFactory:

    """Decode on the fly a binary stream that *may* have a known encoding.

    If the underlying stream has a reset() method,
    it is possible to call the get_..._file() methods more than once.
    """
    # This could be implemented as a file-like object whose read()
    # returns either bytes or str, but it would be very difficult
    # to get it to work with mypy.
    #
    # Having two different methods, one returning IO[str], and one IO[bytes],
    # is much easier to type; also, it's better at showing intent.

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        # False until the first get_..._file() call; subsequent calls
        # must rewind the stream before handing it out again.
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        """Return a text (str) stream decoding the underlying binary stream.

        Args:
            fallback_encoding (str): Used when no encoding was detected.
            errors (str): Error handling scheme, as for codecs readers.

        Raises:
            MissingEncoding: If neither a detected nor a fallback
                encoding is available.
        """
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")
        reader_factory = codecs.getreader(encoding)
        reader = reader_factory(self.file, errors)
        self.reset()
        return reader

    def get_binary_file(self):
        """Return the underlying binary (bytes) stream."""
        self.reset()
        return self.file

    def reset(self):
        """Rewind the stream on repeated use; no-op on the first call."""
        if self.should_reset:
            try:
                self.file.reset()
            except AttributeError:
                # `from None` suppresses the unhelpful AttributeError
                # context; callers only care that rewinding is impossible.
                raise io.UnsupportedOperation(
                    "underlying stream cannot be reset"
                ) from None
        self.should_reset = True

86 changes: 86 additions & 0 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import feedparser
import feedparser.api
import feedparser.datetimes
import feedparser.encodings
import feedparser.http
import feedparser.mixin
import feedparser.sanitizer
Expand Down Expand Up @@ -294,6 +295,91 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self):
})
self.assertEqual(result.encoding, 'gb18030')

def test_prefix_file_wrapper_not_seekable(self):
    # Reading everything works even without seek support...
    wrapper = feedparser.encodings.PrefixFileWrapper(
        b'abc', _make_file_not_seekable(b'def'))

    self.assertEqual(wrapper.read(), b'abcdef')
    self.assertEqual(wrapper.read(), b'')
    # ...but reset() must fail, and leave the stream drained.
    with self.assertRaises(io.UnsupportedOperation):
        wrapper.reset()
    self.assertEqual(wrapper.read(), b'')

    # A failed reset() mid-stream does not disturb the read position.
    wrapper = feedparser.encodings.PrefixFileWrapper(
        b'abc', _make_file_not_seekable(b'def'))

    self.assertEqual(wrapper.read(3), b'abc')
    with self.assertRaises(io.UnsupportedOperation):
        wrapper.reset()
    self.assertEqual(wrapper.read(), b'def')

def test_prefix_file_wrapper_no_prefix(self):
    # With an empty prefix, the wrapper behaves like the bare stream.
    wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
    self.assertEqual(wrapper.read(1), b'a')
    self.assertEqual(wrapper.read(), b'bc')

    wrapper.reset()
    self.assertEqual(wrapper.read(), b'abc')


def make_prefix_file_wrapper_test(make_file):
    """Build a TestEncodings method exercising PrefixFileWrapper
    over a file produced by make_file()."""

    def test(self):
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))

        # unbounded read drains everything exactly once
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')

        # reads straddling the prefix/file boundary
        wrapper.reset()
        for expected in [b'ab', b'cd', b'ef', b'']:
            self.assertEqual(wrapper.read(2), expected)
        self.assertEqual(wrapper.read(), b'')

        # reads aligned with the prefix/file boundary
        wrapper.reset()
        for expected in [b'abc', b'def', b'']:
            self.assertEqual(wrapper.read(3), expected)
        self.assertEqual(wrapper.read(), b'')

        # zero-length read consumes nothing
        wrapper.reset()
        self.assertEqual(wrapper.read(0), b'')
        self.assertEqual(wrapper.read(), b'abcdef')

        # resetting twice in a row is harmless
        wrapper.reset()
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'abcdef')

    return test


def _make_file_in_the_middle(data):
prefix = b'zzzzz'
rv = io.BytesIO(prefix + data)
rv.seek(len(prefix))
return rv

class _make_file_one_by_one(io.BytesIO):
def read(self, size=-1):
if size <= 0:
return super().read(size)
return super().read(1)

class _make_file_not_seekable(io.BytesIO):
    """BytesIO simulating a non-seekable stream: tell()/seek() raise."""
    def tell(self):
        raise io.UnsupportedOperation
    def seek(self, *args):
        raise io.UnsupportedOperation

# File factories to run the generated PrefixFileWrapper tests against.
prefix_file_wrapper_file_factories = [
    io.BytesIO,
    _make_file_in_the_middle,
    _make_file_one_by_one,
]

for factory in prefix_file_wrapper_file_factories:
    func = make_prefix_file_wrapper_test(factory)
    # BUG FIX: name each generated test after its *factory*.
    # func.__name__ is always 'test' (the inner function's name), so every
    # iteration overwrote the same TestEncodings attribute and only the
    # last factory was actually tested.
    setattr(
        TestEncodings,
        'test_prefix_file_wrapper_%s' % factory.__name__.lstrip('_'),
        func,
    )


class TestFeedParserDict(unittest.TestCase):
"""Ensure that FeedParserDict returns values as expected and won't crash"""
Expand Down

0 comments on commit 80b31af

Please sign in to comment.