Skip to content

Commit

Permalink
call normalize correctly (closes #644)
Browse files: browse the repository at this point in the history
  • Loading branch information
mjpost committed Nov 12, 2019
1 parent 20d6c25 commit 6996d00
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
4 changes: 2 additions & 2 deletions bin/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

from datetime import datetime

from normalize_anth import process
from normalize_anth import normalize
from anthology.utils import make_nested, make_simple_element, build_anthology_id, indent
from anthology.index import AnthologyIndex
from anthology.people import PersonName
Expand Down Expand Up @@ -84,7 +84,7 @@
# Normalize
for paper in root_being_added.findall('.//paper'):
for oldnode in paper:
process(oldnode, informat='xml')
normalize(oldnode, informat='latex')

# Ingest each volume.
# First, find the XML file.
Expand Down
20 changes: 15 additions & 5 deletions bin/normalize_anth.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,14 +121,24 @@ def decompose(c):

return s

def process(oldnode, informat):

def normalize(oldnode, informat):
"""
Receives a child element of an XML 'paper' node (callers pass each child in turn; a 'paper' element itself is skipped) and normalizes its text, including:
- Unescaping HTML
- Normalizing quotes and other punctuation
- Mapping many characters to unicode
In addition, if the 'informat' is "latex", it will convert many LaTeX characters
to unicode equivalents. Note that these latter LaTeX operations are not idempotent.
"""

if oldnode.tag in ['url', 'href', 'mrf', 'doi', 'bibtype', 'bibkey',
'revision', 'erratum', 'attachment', 'paper',
'presentation', 'dataset', 'software', 'video']:
return
elif oldnode.tag in ['author', 'editor']:
for oldchild in oldnode:
process(oldchild, informat=informat)
normalize(oldchild, informat=informat)
else:
if informat == "latex":
if len(oldnode) > 0:
Expand All @@ -138,7 +148,7 @@ def process(oldnode, informat):
newnode.tag = oldnode.tag
newnode.attrib.update(oldnode.attrib)
replace_node(oldnode, newnode)

maptext(oldnode, html.unescape)
maptext(oldnode, curly_quotes)
maptext(oldnode, clean_unicode)
Expand Down Expand Up @@ -170,6 +180,6 @@ def process(oldnode, informat):
paper.attrib['id'])
for oldnode in paper:
location = "{}:{}".format(papernum, oldnode.tag)
process(oldnode, informat=informat)
normalize(oldnode, informat=informat)

tree.write(args.outfile, encoding="UTF-8", xml_declaration=True)

0 comments on commit 6996d00

Please sign in to comment.