Support grabbing TeX source from MathML <annotation>s #27

Open. Wants to merge 26 commits into base: master.

Changes from all commits (26 commits):
fdb788e  Some Sanitizer fixes (distler, Jul 26, 2012)
eea8014  Support for TeX source in <annotation> (distler, Dec 12, 2013)
8693312  Support LaTeX in img.latex (distler, Dec 23, 2013)
7e53bed  Update html5lib to 0.999 (olberger, Jan 27, 2014)
68a49a8  Update feedparser to 5.1.3 (olberger, Jan 27, 2014)
e32fa7c  Fix errors in test suite after html5lib recent changes (olberger, Jan 27, 2014)
17af58a  Fix test cases errors on AttributeError: object has no attribute 'sum… (olberger, Jan 27, 2014)
ca213d7  Fix test_missing_item_pubDate no longer passing (olberger, Jan 27, 2014)
5581e07  Fix issue on running filters (olberger, Jan 27, 2014)
e7f2fc8  Fix date values error in templates (olberger, Jan 27, 2014)
f891e7f  Change XML test data so that xmlns are correctly set (olberger, Jan 27, 2014)
cc52488  Try and fix Django settings issue (olberger, Jan 27, 2014)
88fa100  Proposed changes as feedparser seems to have a different behaviour. (olberger, Jan 27, 2014)
b0492c0  Fix scrubbing issue (olberger, Jan 27, 2014)
01b09c3  More support for grabbing TeX source (distler, Jan 31, 2014)
124d5f8  Update feedparser (distler, Mar 30, 2014)
7ff1b74  Merge branch 'update-deps' of https://github.com/olberger/venus (distler, Mar 30, 2014)
af1a936  Update html5lib (distler, Mar 30, 2014)
13e8fc3  Yuck! That's a weird bug in HTML5lib. (distler, Mar 30, 2014)
014375f  STIX Two + Astral plane characters in "narrow" Python builds (distler, Dec 16, 2016)
89cd35f  Might as well do this one too (distler, Dec 17, 2016)
3405bc1  Unvendor feedparser and httplib2 (distler, Dec 25, 2018)
c0b3da2  Fixes for new FeedParser (distler, Dec 27, 2018)
9ccf292  A bit more responsive (distler, Jan 7, 2019)
da2b0ca  Fix remaining broken tests (distler, Jan 8, 2019)
b537638  Update HTML5lib Sanitizer (distler, Mar 2, 2019)
filters/html2xhtml.plugin (2 changes: 1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 import sys
 import html5lib
-tree=html5lib.treebuilders.dom.TreeBuilder
+tree=html5lib.treebuilders.getTreeBuilder('dom')
 parser = html5lib.html5parser.HTMLParser(tree=tree)
 document = parser.parse(sys.stdin)
 sys.stdout.write(document.toxml("utf-8"))
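
This tracks html5lib's switch from exposing tree builder classes directly to the getTreeBuilder() factory. For reference, the patched filter in full; it assumes html5lib 0.999 or later, where the factory is available:

    # filters/html2xhtml.plugin after the patch: parse stdin as HTML,
    # re-serialize it as XHTML.
    import sys
    import html5lib

    tree = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree)
    document = parser.parse(sys.stdin)
    sys.stdout.write(document.toxml("utf-8"))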
planet/reconstitute.py (23 changes: 14 additions & 9 deletions)

@@ -16,8 +16,7 @@
 import re, time, sgmllib
 from xml.sax.saxutils import escape
 from xml.dom import minidom, Node
-from html5lib import html5parser
-from html5lib.treebuilders import dom
+from html5lib import html5parser, treebuilders
 import planet, config
 
 try:
@@ -75,8 +74,8 @@ def id(xentry, entry):
         entry_id = entry.link
     elif entry.has_key("title") and entry.title:
         entry_id = (entry.title_detail.base + "/" +
-            md5(entry.title).hexdigest())
-    elif entry.has_key("summary") and entry.summary:
+            md5(entry.title.encode('utf-8')).hexdigest())
+    elif entry.has_key("summary") and entry.summary and entry.has_key("summary_detail") and entry.summary_detail:
         entry_id = (entry.summary_detail.base + "/" +
             md5(entry.summary).hexdigest())
     elif entry.has_key("content") and entry.content:
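
The .encode('utf-8') guards the id computation against non-ASCII titles: under Python 2, which this codebase targets, hashlib raises UnicodeEncodeError when handed a unicode string it cannot coerce to ASCII. A quick illustration:

    # Python 2: hashing a unicode title fails unless it is encoded first.
    from hashlib import md5

    title = u'Caf\xe9 society'
    md5(title.encode('utf-8')).hexdigest()   # works
    # md5(title).hexdigest()                 # raises UnicodeEncodeError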
@@ -168,7 +167,7 @@ def content(xentry, name, detail, bozo):
             bozo=1
 
     if detail.type.find('xhtml')<0 or bozo:
-        parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
+        parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
         html = parser.parse(xdiv % detail.value, encoding="utf-8")
         for body in html.documentElement.childNodes:
             if body.nodeType != Node.ELEMENT_NODE: continue
@@ -208,7 +207,7 @@ def location(xentry, long, lat):
 
     xlat = createTextElement(xentry, '%s:%s' % ('geo','lat'), '%f' % lat)
     xlat.setAttribute('xmlns:%s' % 'geo', 'http://www.w3.org/2003/01/geo/wgs84_pos#')
-    xlong = createTextElement(xentry, '%s:%s' % ('geo','long'), '%f' % long)
+    xlong = createTextElement(xentry, '%s:%s' % ('geo','long'), '%.6f' % long)
     xlong.setAttribute('xmlns:%s' % 'geo', 'http://www.w3.org/2003/01/geo/wgs84_pos#')
 
     xentry.appendChild(xlat)
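
Note that '%f' already formats with six decimal places, so '%.6f' pins the existing precision explicitly rather than changing the output:

    # '%f' and '%.6f' produce identical strings for floats.
    '%f' % 12.5    # '12.500000'
    '%.6f' % 12.5  # '12.500000'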
@@ -305,8 +304,13 @@ def reconstitute(feed, entry):
         coordinates = where.get('coordinates',None)
         if type == 'Point':
             location(xentry, coordinates[0], coordinates[1])
-        elif type == 'Box' or type == 'LineString' or type == 'Polygon':
-            location(xentry, coordinates[0][0], coordinates[0][1])
+        elif type == 'Box' or type == 'LineString':
+            location(xentry, (coordinates[0][0]+coordinates[1][0])/2.0, (coordinates[0][1]+coordinates[1][1])/2.0)
+        elif type == 'Polygon':
+            vertices = coordinates[0]
+            lats = [row[0] for row in vertices]
+            longs = [row[1] for row in vertices]
+            location(xentry, sum(lats)/float(len(lats)), sum(longs)/float(len(longs)))
     if entry.has_key('geo_lat') and \
         entry.has_key('geo_long'):
         location(xentry, (float)(entry.get('geo_long',None)), (float)(entry.get('geo_lat',None)))
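
Boxes and LineStrings now map to the midpoint of their first two coordinates, and Polygons to the average of their outer ring's vertices, instead of everything collapsing to the first vertex. The vertex mean is a cheap approximation rather than the true area centroid; a standalone sketch:

    # Average the outer ring's vertices (a vertex mean, not the true
    # area centroid, but adequate for placing a point marker).
    vertices = [(0.0, 0.0), (4.0, 0.0), (4.0, 2.0), (0.0, 2.0)]
    lats = [row[0] for row in vertices]
    longs = [row[1] for row in vertices]
    centroid = (sum(lats)/float(len(lats)), sum(longs)/float(len(longs)))
    # centroid == (2.0, 1.0)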
@@ -363,7 +367,8 @@ def reconstitute(feed, entry):
 def entry_updated(feed, entry, default = None):
     chks = ((entry, 'updated_parsed'),
             (entry, 'published_parsed'),
-            (feed, 'updated_parsed'),)
+            (feed, 'updated_parsed'),
+            (feed, 'published_parsed'),)
     for node, field in chks:
         if node.has_key(field) and node[field]:
             return node[field]
planet/scrub.py (46 changes: 23 additions & 23 deletions)

@@ -122,30 +122,30 @@ def scrub(feed_uri, data):
     if entry.has_key('link'):
         node['base'] = entry.link
     else:
-        node['base'] = feedparser._urljoin(
+        node['base'] = feedparser.urls._urljoin(
             node['base'], scrub_xmlbase)
 
-    node['value'] = feedparser._resolveRelativeURIs(
+    node['value'] = feedparser.urls._resolveRelativeURIs(
         node.value, node.base, 'utf-8', node.type)
 
-    # Run this through HTML5's sanitizer
-    doc = None
-    if 'xhtml' in node['type']:
-        try:
-            from xml.dom import minidom
-            doc = minidom.parseString(node['value'])
-        except:
-            node['type']='text/html'
-
-    if not doc:
-        from html5lib import html5parser, treebuilders
-        p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
-        doc = p.parseFragment(node['value'], encoding='utf-8')
-
-    from html5lib import treewalkers, serializer
-    from html5lib.filters import sanitizer
-    walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
-    xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
-    tree = xhtml.serialize(walker, encoding='utf-8')
-
-    node['value'] = ''.join([str(token) for token in tree])
+    if node['value']:
+        # Run this through HTML5's sanitizer
+        doc = None
+        if 'xhtml' in node['type']:
+            try:
+                from xml.dom import minidom
+                doc = minidom.parseString(node['value'])
+            except:
+                node['type']='text/html'
+
+        if not doc:
+            from html5lib import html5parser, treebuilders, sanitizer
+            p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'), tokenizer=sanitizer.HTMLSanitizer)
+            doc = p.parseFragment(node['value'])
+            # doc = p.parseFragment(node['value'], encoding='utf-8')
+
+        from html5lib import treewalkers, serializer
+        walker = treewalkers.getTreeWalker('dom')(doc)
+        xhtml = serializer.HTMLSerializer(inject_meta_charset = False)
+        tree = xhtml.serialize(walker, encoding='utf-8')
+        node['value'] = ''.join([str(token) for token in tree])
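
This hunk is where the branch diverges from master: master sanitizes through the newer filter-based API (html5lib.filters.sanitizer.Filter plus XHTMLSerializer), while the branch skips empty values and uses the tokenizer-based sanitizer with an HTMLSerializer. A minimal sketch of the tokenizer-based approach; it assumes an html5lib release that still ships sanitizer.HTMLSanitizer (roughly 0.999 through 0.9999999), since later releases only offer the filter:

    # Sanitize during tokenization: unsafe markup never reaches the tree.
    from html5lib import html5parser, treebuilders, sanitizer

    p = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'),
                               tokenizer=sanitizer.HTMLSanitizer)
    doc = p.parseFragment('<p onclick="alert(1)">hi</p>')
    # The onclick attribute is dropped while the tokens are produced.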
planet/shell/__init__.py (4 changes: 2 additions & 2 deletions)

@@ -45,9 +45,9 @@ def run(template_file, doc, mode='template'):
     module_name = ext[1:]
     try:
         try:
-            module = __import__("_" + module_name)
+            module = __import__("planet.shell." + "_" + module_name, "", "", [""])
         except:
-            module = __import__(module_name)
+            module = __import__("planet.shell." + module_name, "", "", [""])
     except Exception, inst:
         return log.error("Skipping %s '%s' after failing to load '%s': %s",
             mode, template_resolved, module_name, inst)
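
The extra __import__ arguments are the usual idiom for importing a dotted submodule: without a non-empty fromlist, __import__('planet.shell._tmpl') would return the top-level planet package rather than the plugin module. A sketch, using a hypothetical plugin name; importlib.import_module is the clearer modern equivalent:

    # A non-empty fromlist makes __import__ return the leaf module itself.
    module = __import__("planet.shell." + "_tmpl", "", "", [""])

    # Equivalent, and clearer:
    import importlib
    module = importlib.import_module("planet.shell._tmpl")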
planet/shell/dj.py (3 changes: 1 addition & 2 deletions)

@@ -19,13 +19,12 @@ def run(script, doc, output_file=None, options={}):
     # I need to re-import the settings at every call because I have to
     # set the TEMPLATE_DIRS variable programmatically
     from django.conf import settings
-    settings._wrapped=None
     try:
         settings.configure(
             DEBUG=True, TEMPLATE_DEBUG=True,
             TEMPLATE_DIRS=(os.path.dirname(script),)
         )
-    except EnvironmentError:
+    except RuntimeError:
         pass
     from django.template import Context
     from django.template.loader import get_template
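
The exception swap matches Django's behaviour: calling settings.configure() a second time raises RuntimeError in anything recent, where only very old Django raised EnvironmentError, and resetting the private settings._wrapped attribute is no longer needed. A sketch of the pattern:

    # configure() may only run once per process; later calls raise
    # RuntimeError ('Settings already configured'), which is safe to ignore.
    from django.conf import settings
    try:
        settings.configure(DEBUG=True, TEMPLATE_DIRS=('/some/template/dir',))
    except RuntimeError:
        pass  # already configured by an earlier run() call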
planet/shell/tmpl.py (8 changes: 6 additions & 2 deletions)

@@ -2,8 +2,10 @@
 import sgmllib, time, os, sys, new, urlparse, re
 from planet import config, feedparser
 import htmltmpl
+import datetime
 
-voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
+# voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
+voids=[]
 empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))
 
 class stripHtml(sgmllib.SGMLParser):
@@ -186,7 +188,7 @@ def template_info(source):
     """ get template information from a feedparser output """
 
     # wire in support for planet:source, call feedparser, unplug planet:source
-    mixin=feedparser._FeedParserMixin
+    mixin=feedparser.mixin._FeedParserMixin
     mixin._start_planet_source = mixin._start_source
     mixin._end_planet_source = \
         new.instancemethod(_end_planet_source, None, mixin)
@@ -253,6 +255,8 @@ def run(script, doc, output_file=None, options={}):
     template = manager.prepare(script)
     tp = htmltmpl.TemplateProcessor(html_escape=0)
     for key,value in template_info(doc).items():
+        if type(value) == datetime.datetime:
+            value = value.isoformat()
         tp.set(key, value)
 
     if output_file:
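
The datetime check pairs with the 'Fix date values error in templates' commit: htmltmpl apparently cannot render datetime objects directly, so they are flattened to ISO 8601 strings before being handed to the template processor:

    # Flatten datetimes to strings htmltmpl can substitute.
    import datetime

    value = datetime.datetime(2019, 1, 8, 12, 30, 0)
    if type(value) == datetime.datetime:
        value = value.isoformat()  # '2019-01-08T12:30:00'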
planet/spider.py (18 changes: 9 additions & 9 deletions)

@@ -45,10 +45,10 @@ def filename(directory, filename):
     filename = re_final_cruft.sub("", filename)
 
     # limit length of filename
-    if len(filename)>250:
+    if len(filename)>230:
         parts=filename.split(',')
         for i in range(len(parts),0,-1):
-            if len(','.join(parts[:i])) < 220:
+            if len(','.join(parts[:i])) < 200:
                 filename = ','.join(parts[:i]) + ',' + \
                     md5(','.join(parts[i:])).hexdigest()
                 break
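
Cache filenames are flattened URIs; over-long ones keep a comma-delimited prefix and replace the tail with its md5 digest. Lowering the caps from 250/220 to 230/200 presumably leaves more headroom under the common 255-byte filename limit. A self-contained version of the scheme with the new caps:

    # Truncate at a comma boundary and fingerprint the discarded tail.
    from hashlib import md5

    name = ','.join('segment%02d' % i for i in range(40))
    if len(name) > 230:
        parts = name.split(',')
        for i in range(len(parts), 0, -1):
            if len(','.join(parts[:i])) < 200:
                name = (','.join(parts[:i]) + ',' +
                        md5(','.join(parts[i:])).hexdigest())
                break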
@@ -107,7 +107,7 @@ def writeCache(feed_uri, feed_info, data):
     if not feed_info.feed.has_key('planet_message'):
         if feed_info.feed.has_key('planet_updated'):
             updated = feed_info.feed.planet_updated
-            if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+            if feedparser.datetimes.iso8601._parse_date_iso8601(updated) >= activity_horizon:
                 return
     else:
         if feed_info.feed.planet_message.startswith("no activity in"):
@@ -158,7 +158,7 @@ def writeCache(feed_uri, feed_info, data):
                 link['type'] = feedtype
                 break
         else:
-            data.feed.links.append(feedparser.FeedParserDict(
+            data.feed.links.append(feedparser.util.FeedParserDict(
                 {'rel':'self', 'type':feedtype, 'href':feed_uri}))
     for name, value in config.feed_options(feed_uri).items():
         data.feed['planet_'+name] = value
@@ -226,7 +226,7 @@ def writeCache(feed_uri, feed_info, data):
 
     # apply any filters
     xdoc = reconstitute.reconstitute(data, entry)
-    output = xdoc.toxml().encode('utf-8')
+    output = xdoc.toxml("utf-8")
    xdoc.unlink()
     for filter in config.filters(feed_uri):
         output = shell.run(filter, output, mode="filter")
@@ -257,7 +257,7 @@ def writeCache(feed_uri, feed_info, data):
         data.feed['planet_updated'] = \
             time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
     elif data.feed.has_key('planet_updated'):
-        updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]
+        updated = [feedparser.datetimes.iso8601._parse_date_iso8601(data.feed.planet_updated)]
 
     if not updated or updated[-1] < activity_horizon:
         msg = "no activity in %d days" % config.activity_threshold(feed_uri)
@@ -287,7 +287,7 @@ def writeCache(feed_uri, feed_info, data):
     xdoc=minidom.parseString('''<feed xmlns:planet="%s"
       xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
     reconstitute.source(xdoc.documentElement,data.feed,data.bozo,data.version)
-    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
+    write(xdoc.toxml("utf-8"), filename(sources, feed_uri))
     xdoc.unlink()
 
 def httpThread(thread_index, input_queue, output_queue, log):
@@ -301,7 +301,7 @@ def httpThread(thread_index, input_queue, output_queue, log):
             feed = StringIO('')
             setattr(feed, 'url', uri)
             setattr(feed, 'headers',
-                feedparser.FeedParserDict({'status':'500'}))
+                feedparser.util.FeedParserDict({'status':'500'}))
             try:
                 # map IRI => URI
                 try:
@@ -448,7 +448,7 @@ def spiderPlanet(only_if_new = False):
 
                 data = feedparser.parse(feed, **options)
             else:
-                data = feedparser.FeedParserDict({'version': None,
+                data = feedparser.util.FeedParserDict({'version': None,
                     'headers': feed.headers, 'entries': [], 'feed': {},
                     'href': feed.url, 'bozo': 0,
                     'status': int(feed.headers.status)})
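
The feedparser.util, feedparser.urls, feedparser.datetimes, and feedparser.mixin references throughout this diff track the package's split into submodules, which comes with the unvendored feedparser adopted in 3405bc1; the old bundled copy was a single module with everything at top level. A small sketch, assuming a feedparser new enough to have the split layout:

    # FeedParserDict moved to feedparser.util but still behaves like a
    # dict with attribute-style access.
    import feedparser

    stub = feedparser.util.FeedParserDict({'status': '500'})
    assert stub.status == stub['status']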