Skip to content

Commit

Permalink
Add stream-oriented version of convert_to_utf8().
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Jan 24, 2022
1 parent 3a806ae commit 80b31af
Show file tree
Hide file tree
Showing 2 changed files with 297 additions and 0 deletions.
211 changes: 211 additions & 0 deletions feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import cgi
import codecs
import io
import re

try:
Expand All @@ -46,6 +47,7 @@ def lazy_chardet_encoding(data):
CharacterEncodingUnknown,
NonXMLContentType,
)
from .sanitizer import replace_doctype


# Each marker represents some of the characters of the opening XML
Expand Down Expand Up @@ -291,3 +293,212 @@ def convert_to_utf8(http_headers, data, result):
result['bozo'] = True
result['bozo_exception'] = error
return data


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
    """Like convert_to_utf8(), but for a binary stream.

    Unlike convert_to_utf8(), do not read the entire file in memory;
    instead, return a text stream that decodes it on the fly.
    This should consume significantly less memory,
    because it avoids (repeatedly) converting the entire file contents
    from bytes to str and back.

    To detect the encoding, only a prefix of the file contents is used.
    In rare cases, the wrong encoding may be detected for this prefix;
    use optimistic_encoding_detection=False to use the entire file contents
    (equivalent to a plain convert_to_utf8() call).

    In addition to convert_to_utf8(),
    call .sanitizer.replace_doctype() on the contents of the stream,
    and set result['version'] to the returned version.

    Args:
        http_headers (dict): The response headers.
        file (typing.IO[bytes]): A read()-able binary stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        tuple(StreamFactory, dict):
            A pair of:

            * a stream factory, with the detected encoding set, if any
            * the safe_entities dict returned by replace_doctype()

    """
    # Currently, this wraps convert_to_utf8(), because the logic is simply
    # too complicated to ensure it's re-implemented correctly for a stream.
    # That said, it should be possible to change the implementation
    # transparently (not sure it's worth it, though).
    #
    # We are forced to call replace_doctype() here, because the returned
    # data may be shorter than the input, and we need to be able to
    # stitch the stream back correctly. To have a single file wrapper,
    # all the changes must happen in one place.
    # Doing it here keeps the code cleaner overall.

    if optimistic_encoding_detection:
        # Detect encoding (and fill result) from a bounded prefix only;
        # the converted prefix is then stitched back onto the rest of the file.
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        result['version'], prefix, entities = replace_doctype(prefix)
        file = PrefixFileWrapper(prefix, file)

    else:
        # this shouldn't increase memory usage if file is BytesIO,
        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)
        result['version'], data, entities = replace_doctype(data)
        # still need to be able to reset() to the "beginning"
        file = PrefixFileWrapper(b'', io.BytesIO(data))

    return StreamFactory(file, result.get('encoding')), entities


# Default number of bytes used to detect a stream's encoding.
# In initial tests, 4k was enough for ~160 mostly-English feeds;
# 64k seems like a safe margin.
CONVERT_FILE_PREFIX_LEN = 2 ** 16

def convert_file_prefix_to_utf8(http_headers, file, result, prefix_len=CONVERT_FILE_PREFIX_LEN):
    """Like convert_to_utf8(), but only use the prefix of a binary file.

    Set result like convert_to_utf8() would.

    Return the updated prefix, as bytes.
    """
    prefix = file.read(prefix_len)

    # Try the conversion up to 4 times, growing the prefix one byte per
    # attempt, so that a multi-byte code point cut at the prefix boundary
    # eventually ends on a character boundary.
    for _ in range(4):
        candidate_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, candidate_result)

        # success: the prefix decoded cleanly
        if not candidate_result.get('bozo'):
            break

        # a short read means the prefix is actually the entire file;
        # there is nothing more we can do about the error
        if len(prefix) < prefix_len:
            break

        extra_byte = file.read(1)
        if not extra_byte:
            # end of file reached; give up with the last attempt's result
            break

        prefix += extra_byte
        prefix_len += 1

    result.update(candidate_result)
    return converted_prefix


class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    If the underlying file is seekable, it is possible to read()
    the same content again by calling reset().

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'
    >>> wrapped.reset()
    >>> wrapped.read()
    'CDef'
    >>>
    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file

        try:
            self.file_initial_offset = file.tell()
        except OSError:
            # Not seekable; reset() will fail when seek() is attempted.
            self.file_initial_offset = None

        # How many bytes/chars of the combined (prefix + file) stream
        # have been consumed so far.
        self.offset = 0

    def reset(self):
        """Rewind to the start of the combined stream.

        Raises io.UnsupportedOperation if the underlying stream
        is not seekable.
        """
        self.file.seek(self.file_initial_offset)
        self.offset = 0

    def read(self, size=-1):
        """Read up to *size* bytes/chars; read everything if size < 0."""
        # A zero-length read yields an empty object of the stream's type
        # (b'' or ''), so concatenation below works for both.
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # BUG FIX: serve only the *unread* part of the prefix.
                # Previously this was `chunk = self.prefix`, which returned
                # already-consumed prefix data again after a partial read.
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # size < 0 means "read all": one read() drains the file.
            # size == 0 means the request is satisfied.
            if size <= 0:
                break

            size -= len(chunk)

        return buffer

    def close(self):
        # do not touch the underlying stream; the caller owns it
        pass


class MissingEncoding(io.UnsupportedOperation):
    """Raised by StreamFactory.get_text_file() when no encoding is available."""
    pass


class StreamFactory:

    """Decode on the fly a binary stream that *may* have a known encoding.

    If the underlying stream has a reset() method,
    it is possible to call the get_..._file() methods more than once.
    """
    # This could be implemented as a file-like object whose read()
    # returns either bytes or str, but it would be very difficult
    # to get it to work with mypy.
    #
    # Having two different methods, one returning IO[str], and one IO[bytes],
    # is much easier to type; also, it's better at showing intent.

    def __init__(self, file, encoding=None):
        self.file = file
        self.encoding = encoding
        # False until the first get_..._file() call; subsequent calls
        # must rewind the stream before handing it out again.
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        """Return a text (str) stream decoding the underlying binary stream.

        Args:
            fallback_encoding (str): Used when no encoding was detected.
            errors (str): Error handling scheme, as for codecs readers.

        Raises:
            MissingEncoding: If neither a detected nor a fallback
                encoding is available.
        """
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")
        reader_factory = codecs.getreader(encoding)
        reader = reader_factory(self.file, errors)
        self.reset()
        return reader

    def get_binary_file(self):
        """Return the underlying binary (bytes) stream."""
        self.reset()
        return self.file

    def reset(self):
        """Rewind the stream on repeated use; no-op on the first call."""
        if self.should_reset:
            try:
                self.file.reset()
            except AttributeError:
                # `from None` suppresses the unhelpful AttributeError
                # context; callers only care that rewinding is impossible.
                raise io.UnsupportedOperation(
                    "underlying stream cannot be reset"
                ) from None
        self.should_reset = True

86 changes: 86 additions & 0 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import feedparser
import feedparser.api
import feedparser.datetimes
import feedparser.encodings
import feedparser.http
import feedparser.mixin
import feedparser.sanitizer
Expand Down Expand Up @@ -294,6 +295,91 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self):
})
self.assertEqual(result.encoding, 'gb18030')

def test_prefix_file_wrapper_not_seekable(self):
    # Reading everything works even without seek support...
    wrapper = feedparser.encodings.PrefixFileWrapper(
        b'abc', _make_file_not_seekable(b'def'))

    self.assertEqual(wrapper.read(), b'abcdef')
    self.assertEqual(wrapper.read(), b'')
    # ...but reset() must fail, and leave the stream drained.
    with self.assertRaises(io.UnsupportedOperation):
        wrapper.reset()
    self.assertEqual(wrapper.read(), b'')

    # A failed reset() mid-stream does not disturb the read position.
    wrapper = feedparser.encodings.PrefixFileWrapper(
        b'abc', _make_file_not_seekable(b'def'))

    self.assertEqual(wrapper.read(3), b'abc')
    with self.assertRaises(io.UnsupportedOperation):
        wrapper.reset()
    self.assertEqual(wrapper.read(), b'def')

def test_prefix_file_wrapper_no_prefix(self):
    # With an empty prefix, the wrapper behaves like the bare stream.
    wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
    self.assertEqual(wrapper.read(1), b'a')
    self.assertEqual(wrapper.read(), b'bc')

    wrapper.reset()
    self.assertEqual(wrapper.read(), b'abc')


def make_prefix_file_wrapper_test(make_file):
    """Build a TestEncodings method exercising PrefixFileWrapper
    over a file produced by make_file()."""

    def test(self):
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))

        # unbounded read drains everything exactly once
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')

        # reads straddling the prefix/file boundary
        wrapper.reset()
        for expected in [b'ab', b'cd', b'ef', b'']:
            self.assertEqual(wrapper.read(2), expected)
        self.assertEqual(wrapper.read(), b'')

        # reads aligned with the prefix/file boundary
        wrapper.reset()
        for expected in [b'abc', b'def', b'']:
            self.assertEqual(wrapper.read(3), expected)
        self.assertEqual(wrapper.read(), b'')

        # zero-length read consumes nothing
        wrapper.reset()
        self.assertEqual(wrapper.read(0), b'')
        self.assertEqual(wrapper.read(), b'abcdef')

        # resetting twice in a row is harmless
        wrapper.reset()
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'abcdef')

    return test


def _make_file_in_the_middle(data):
prefix = b'zzzzz'
rv = io.BytesIO(prefix + data)
rv.seek(len(prefix))
return rv

class _make_file_one_by_one(io.BytesIO):
def read(self, size=-1):
if size <= 0:
return super().read(size)
return super().read(1)

class _make_file_not_seekable(io.BytesIO):
    """BytesIO simulating a non-seekable stream: tell()/seek() raise."""
    def tell(self):
        raise io.UnsupportedOperation
    def seek(self, *args):
        raise io.UnsupportedOperation

# File factories to run the generated PrefixFileWrapper tests against.
prefix_file_wrapper_file_factories = [
    io.BytesIO,
    _make_file_in_the_middle,
    _make_file_one_by_one,
]

for factory in prefix_file_wrapper_file_factories:
    func = make_prefix_file_wrapper_test(factory)
    # BUG FIX: name each generated test after its *factory*.
    # func.__name__ is always 'test' (the inner function's name), so every
    # iteration overwrote the same TestEncodings attribute and only the
    # last factory was actually tested.
    setattr(
        TestEncodings,
        'test_prefix_file_wrapper_%s' % factory.__name__.lstrip('_'),
        func,
    )


class TestFeedParserDict(unittest.TestCase):
"""Ensure that FeedParserDict returns values as expected and won't crash"""
Expand Down

0 comments on commit 80b31af

Please sign in to comment.