Update _parse_file_inplace() to use convert_file_to_utf8().
lemon24 committed May 22, 2022
1 parent 0806be0 commit 29eba2c
Showing 3 changed files with 95 additions and 10 deletions.
5 changes: 5 additions & 0 deletions feedparser/__init__.py
@@ -46,3 +46,8 @@
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1


# If you want feedparser to use only a prefix of the feed to detect encodings
# (uses less memory), set this to 1.
OPTIMISTIC_ENCODING_DETECTION = 1
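
For context, the new setting can be changed module-wide or per call; a minimal usage sketch (the feed URL is a placeholder):

    import feedparser

    # Module-wide default: disable prefix-based detection for every parse() call.
    feedparser.OPTIMISTIC_ENCODING_DETECTION = 0

    # Per call: the keyword argument overrides the module-level default.
    d = feedparser.parse(
        'https://example.com/feed.xml',
        optimistic_encoding_detection=False,
    )
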
69 changes: 59 additions & 10 deletions feedparser/api.py
@@ -35,7 +35,7 @@
import xml.sax

from .datetimes import registerDateHandler, _parse_date
from .encodings import convert_to_utf8
from .encodings import convert_file_to_utf8, MissingEncoding
from .html import BaseHTMLProcessor
from . import http
from .mixin import XMLParserMixin
@@ -156,6 +156,7 @@ def parse(
response_headers: Dict[str, str] = None,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
optimistic_encoding_detection: bool = None,
) -> FeedParserDict:
"""Parse a feed from a URL, file, stream, or string.
@@ -199,6 +200,11 @@ def parse(
Should feedparser skip HTML sanitization? Only disable this if you know
what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
:param optimistic_encoding_detection:
Should feedparser use only a prefix of the feed to detect encodings
(uses less memory, but the wrong encoding may be detected in rare cases).
Defaults to the value of
:data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
"""

@@ -230,14 +236,17 @@
result['headers'].update(response_headers or {})

# TODO (lemon24): remove this once _open_resource() returns an open file
file = io.BytesIO(data)
file = io.BytesIO(data) if isinstance(data, bytes) else io.StringIO(data)

# TODO (lemon24): handle io.UnsupportedOperation raised by seek() attempts

try:
_parse_file_inplace(
file,
result,
resolve_relative_uris=resolve_relative_uris,
sanitize_html=sanitize_html,
optimistic_encoding_detection=optimistic_encoding_detection,
)
finally:
if not hasattr(url_file_stream_or_string, 'read'):
@@ -253,24 +262,39 @@ def _parse_file_inplace(
*,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
optimistic_encoding_detection: bool = None,
) -> None:

# TODO (lemon24): remove this once we start using convert_file_to_utf8()
data = file.read()

# Avoid a cyclic import.
import feedparser
if sanitize_html is None:
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
if optimistic_encoding_detection is None:
optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)

stream_factory = convert_file_to_utf8(
result['headers'], file, result, optimistic_encoding_detection
)
# We're done with file, all access must happen through stream_factory.
del file

# Some notes about the stream_factory.get_{text,binary}_file() methods:
#
# Calling them a second time will raise io.UnsupportedOperation
# if the underlying file was not seekable.
#
# Calling close() on the returned file is ignored
# (that is, the underlying file is *not* closed),
# because the SAX parser closes the file when done;
# we don't want that, since we might try again with the loose parser.

data = convert_to_utf8(result['headers'], data, result)
use_json_parser = result['content-type'] == 'application/json'
use_strict_parser = result['encoding'] and True or False

if not use_json_parser:
result['version'], data, entities = replace_doctype(data)
result['version'], stream_factory.prefix, entities = replace_doctype(stream_factory.prefix)

# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result['headers'].get('content-location', '')
@@ -283,15 +307,18 @@ def _parse_file_inplace(

if not _XML_AVAILABLE:
use_strict_parser = False

feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]

if use_json_parser:
result['version'] = None
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
try:
feed_parser.feed(io.BytesIO(data))
feed_parser.feed(stream_factory.get_file())
except Exception as e:
result['bozo'] = 1
result['bozo_exception'] = e

elif use_strict_parser:
# Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
@@ -307,7 +334,14 @@ def _parse_file_inplace(
saxparser.setContentHandler(feed_parser)
saxparser.setErrorHandler(feed_parser)
source = xml.sax.xmlreader.InputSource()
source.setByteStream(io.BytesIO(data))

# If an encoding was detected, decode the file on the fly;
# otherwise, pass it as-is and let the SAX parser deal with it.
try:
source.setCharacterStream(stream_factory.get_text_file())
except MissingEncoding:
source.setByteStream(stream_factory.get_binary_file())

try:
saxparser.parse(source)
except xml.sax.SAXException as e:
@@ -321,7 +355,22 @@ def _parse_file_inplace(
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html
feed_parser.feed(data.decode('utf-8', 'replace'))

# If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
# Will raise io.UnsupportedOperation if the underlying file is not seekable.
data = stream_factory.get_text_file('utf-8', 'replace').read()

# As of 6.0.8, LooseFeedParser.feed() can be called exactly once
# with the entire data (it does some re.sub() and str.replace() on it).
#
# SGMLParser (of which LooseFeedParser is a subclass)
# *can* be fed in a streaming fashion,
# by calling feed() repeatedly with chunks of text.
#
# When/if LooseFeedParser supports being fed chunks,
# replace the read() call above with read(size)/feed() calls in a loop.

feed_parser.feed(data)

result['feed'] = feed_parser.feeddata
result['entries'] = feed_parser.entries
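
A note on the LooseFeedParser comment above: feed() currently needs the whole text at once, but if chunked feeding ever becomes supported, the read() call could turn into a loop along these lines (hypothetical sketch, not valid as of 6.0.8):

    text_file = stream_factory.get_text_file('utf-8', 'replace')
    while True:
        chunk = text_file.read(2 ** 16)  # chunk size is an arbitrary choice
        if not chunk:
            break
        feed_parser.feed(chunk)
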
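The stream_factory contract described in the comment block in _parse_file_inplace() (rewind on every get_*_file() call, ignore close() on the returned file) can be illustrated roughly as below. The class names and attributes here are illustrative assumptions, not the actual objects returned by feedparser.encodings.convert_file_to_utf8():

    import codecs


    class MissingEncoding(Exception):
        """Stand-in for feedparser.encodings.MissingEncoding (assumed)."""


    class _NonClosingFile:
        """Proxy whose close() is a no-op, so the SAX parser cannot close the feed."""

        def __init__(self, wrapped):
            self._wrapped = wrapped

        def close(self):
            pass  # deliberately ignored; the loose parser may still need the file

        def __getattr__(self, name):
            return getattr(self._wrapped, name)


    class IllustrativeStreamFactory:
        """Rough sketch of the documented stream_factory behavior."""

        def __init__(self, file, encoding=None, prefix=b''):
            self.file = file          # underlying binary feed file
            self.encoding = encoding  # detected encoding, or None
            self.prefix = prefix      # initial chunk of the feed, used by replace_doctype()

        def _rewind(self):
            # For a non-seekable file, a second get_*_file() call ends up here
            # and io.UnsupportedOperation propagates to the caller.
            self.file.seek(0)

        def get_binary_file(self):
            self._rewind()
            return _NonClosingFile(self.file)

        def get_text_file(self, fallback_encoding=None, errors='strict'):
            encoding = self.encoding or fallback_encoding
            if encoding is None:
                raise MissingEncoding('no encoding was detected')
            self._rewind()
            return _NonClosingFile(codecs.getreader(encoding)(self.file, errors))
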
31 changes: 31 additions & 0 deletions tests/runtests.py
@@ -1046,6 +1046,37 @@ def test_resolve_relative_uris_off(self):
resolve_relative_uris=False)
self.assertEqual(u'<a href="/boo.html">boo</a>', d.entries[1].content[0].value)

def test_optimistic_encoding_detection(self):
length = feedparser.encodings.CONVERT_FILE_PREFIX_LEN
digits = '0123456789abcdef😀'
description = digits * int(length / len(digits) * 1.5)

feed_xml = f"""
<rss version="2.0">
<channel>
<item>
<guid isPermaLink="false">id</guid>
<description>{description}</description>
</item>
</channel>
</rss>
"""

kwargs_params = {
'default': dict(),
'on': dict(optimistic_encoding_detection=True),
'off': dict(optimistic_encoding_detection=False),
}
input_params = {
'binary_file': lambda: io.BytesIO(feed_xml.encode('utf-8')),
'text_file': lambda: io.StringIO(feed_xml),
}

for kwargs_name, kwargs in kwargs_params.items():
for input_name, make_input in input_params.items():
with self.subTest(f"{kwargs_name} {input_name}"):
d = feedparser.parse(make_input(), **kwargs)
self.assertEqual(d.entries[0].description, description)

class TestSanitizer(unittest.TestCase):
def test_style_attr_is_enabled(self):
