Nested volumes and explicit <url> tags #324

Merged (50 commits, Jun 21, 2019)

Commits
- 1dda89a: added script, changed schema (mjpost, May 9, 2019)
- ec103c5: fixed ACL 2018 (mjpost, May 9, 2019)
- 6f3e58c: extended volume schema to add volume url (mjpost, May 9, 2019)
- 9247c59: Merge branch 'master' into consolidate_urls (mjpost, May 10, 2019)
- b555d74: fixed schema (mjpost, May 10, 2019)
- e159174: use <url> tag (mjpost, May 10, 2019)
- df1cdd8: modularized to export functions (mjpost, May 10, 2019)
- 6496178: started volume work, postponing till #317 (mjpost, May 10, 2019)
- 241d311: explicit <url> tag on all XML files (mjpost, May 10, 2019)
- 05513e7: updated schemas, added conversion script (mjpost, May 11, 2019)
- d9849c3: Merge branch 'master' into hierarchical (mjpost, Jun 1, 2019)
- 2b666ce: Merge branch 'NAACL19' into hierarchical (mjpost, Jun 1, 2019)
- 0dfa73a: removed stray . (mjpost, Jun 1, 2019)
- bf32ab2: root -> collection (mjpost, Jun 1, 2019)
- 2275cfc: fixed logic in URL rewriting (mjpost, Jun 1, 2019)
- f0fbdfb: canonicalized URLs (mjpost, Jun 1, 2019)
- 330a756: fixed conflict in L06 (mjpost, Jun 1, 2019)
- 150a88f: root -> collection (mjpost, Jun 2, 2019)
- 5448bbb: Merge branch 'master' into hierarchical (mjpost, Jun 2, 2019)
- cc564a4: updated schema, added script, converted NAACL (mjpost, Jun 3, 2019)
- a1ca0ad: reverted N19 conversion (mjpost, Jun 6, 2019)
- d991fa4: Merge branch 'master' into consolidate_urls (mjpost, Jun 6, 2019)
- f028e85: *** empty log message *** (mjpost, Jun 7, 2019)
- af3c21f: booktitle, moved indent() (mjpost, Jun 8, 2019)
- 79e946d: restructured code to read nested format (mjpost, Jun 8, 2019)
- 8940383: expanding schema to accommodate long tail (mjpost, Jun 10, 2019)
- d73a8e5: finessing canonical indent (mjpost, Jun 10, 2019)
- ffa6f5c: code changes for yaml gen (mjpost, Jun 10, 2019)
- bfe55ae: Merge branch 'master' into consolidate_urls (mjpost, Jun 15, 2019)
- 2056f80: fixed indenting (mjpost, Jun 16, 2019)
- 9afe93b: force booktitle (mjpost, Jun 16, 2019)
- b65fe7d: special handling for C69 conversion (mjpost, Jun 16, 2019)
- a2d05a1: bugfix in stripping during indentation (mjpost, Jun 16, 2019)
- 1bd5aae: typos, logic fixes with inheritance (mjpost, Jun 16, 2019)
- 3fc7953: safety check on paper_id len (mjpost, Jun 16, 2019)
- 2a1ff56: bugfix (weird clash with builtin 'formatter' class) (mjpost, Jun 16, 2019)
- f008a5d: fix YAML volume title issue; only print volume URLs if present in XML (mjpost, Jun 16, 2019)
- 192fd33: Merge branch 'master' into consolidate_urls (mjpost, Jun 16, 2019)
- 3860b4d: converted to nested format (mjpost, Jun 16, 2019)
- 2db7f09: Revert "converted to nested format" (mjpost, Jun 16, 2019)
- 10a955a: fixed search for frontmatter URL (mjpost, Jun 16, 2019)
- b73c03a: move full-volume DOIs from frontmatter to metadata (mjpost, Jun 16, 2019)
- 401b6aa: fixed bug in full paper id (mjpost, Jun 16, 2019)
- 6b28a51: remove booktitle from journals and frontmatter (mjpost, Jun 17, 2019)
- fe33e02: disappear PDF button for frontmatter when no <url> tag (mjpost, Jun 17, 2019)
- 1ec399d: fixed DOI, frontmatter <url> tags (mjpost, Jun 17, 2019)
- 67d31f3: updated map to list volume ID (mjpost, Jun 17, 2019)
- 9962619: fixed hugo syntax (mjpost, Jun 19, 2019)
- ace9e95: semanticized presence of frontmatter (mjpost, Jun 19, 2019)
- 555e4a0: missed a 'with' in the template (mjpost, Jun 20, 2019)
bin/anthology/anthology.py: 55 changes (26 additions, 29 deletions)
@@ -54,38 +54,35 @@ def import_directory(self, importdir):
for xmlfile in glob(importdir + "/xml/*.xml"):
self.import_file(xmlfile)
self.pindex.verify()

def import_file(self, filename):
tree = etree.parse(filename)
volume = tree.getroot()
top_level_id = volume.get("id")
if top_level_id in self.volumes:
log.critical(
"Attempted to import top-level ID '{}' twice".format(top_level_id)
)
log.critical("Triggered by file: {}".format(filename))
current_volume = None
for paper in volume:
parsed_paper = Paper.from_xml(paper, top_level_id, self.formatter)
self.pindex.register(parsed_paper)
full_id = parsed_paper.full_id
if full_id in self.papers:
collection = tree.getroot()
collection_id = collection.get("id")
for volume_xml in collection:
volume = Volume.from_xml(volume_xml, collection_id, self.venues, self.sigs, self.formatter)

if volume.full_id in self.volumes:
log.critical(
"Attempted to import paper '{}' twice -- skipping".format(full_id)
"Attempted to import volume ID '{}' twice".format(volume.full_id)
)
continue
if parsed_paper.is_volume:
if current_volume is not None:
self.volumes[current_volume.full_id] = current_volume
current_volume = Volume(parsed_paper, self.venues, self.sigs)
else:
if current_volume is None:
log.critical("Triggered by file: {}".format(filename))

# front matter
if len(volume.content):
front_matter = volume.content[0]
self.pindex.register(front_matter)
self.papers[front_matter.full_id] = front_matter

self.volumes[volume.full_id] = volume
for paper in volume_xml.findall('paper'):
parsed_paper = Paper.from_xml(paper, volume, self.formatter)
self.pindex.register(parsed_paper)
full_id = parsed_paper.full_id
if full_id in self.papers:
log.critical(
"First paper of XML should be volume entry, but '{}' is not interpreted as one".format(
full_id
)
"Attempted to import paper '{}' twice -- skipping".format(full_id)
)
current_volume.append(parsed_paper)
self.papers[full_id] = parsed_paper
if current_volume is not None:
self.volumes[current_volume.full_id] = current_volume
continue
volume.append(parsed_paper)
self.papers[full_id] = parsed_paper
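The restructured import_file above walks one collection per XML file: the root <collection> carries the collection ID, each child <volume> is built with Volume.from_xml, front matter (if any) is registered first, and the remaining <paper> elements are attached to that volume. A minimal standalone sketch of that traversal, using a made-up sample file in the new nested format (the tag contents here are illustrative, not the real Anthology data):

```python
from lxml import etree

# Made-up sample in the new nested format: a <collection> root, nested
# <volume> elements, and explicit <url> tags (illustrative only).
SAMPLE = b"""<collection id="P19">
  <volume id="1">
    <meta>
      <booktitle>Proceedings of an Example Conference</booktitle>
      <url>P19-1</url>
    </meta>
    <paper id="1">
      <title>An Example Paper</title>
      <url>P19-1001</url>
    </paper>
  </volume>
</collection>"""

collection = etree.fromstring(SAMPLE)
collection_id = collection.get("id")          # "P19"
for volume_xml in collection:
    volume_id = volume_xml.get("id")          # "1"
    print("volume", "{}-{}".format(collection_id, volume_id))
    for paper in volume_xml.findall("paper"):
        # zero-padding mirrors the paper_id property added in papers.py below
        print("  paper", "{}-{}{:03d}".format(collection_id, volume_id, int(paper.get("id"))))
```

On the real data the same loop is driven by Volume.from_xml and Paper.from_xml, which additionally copy volume-level metadata into each paper.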
bin/anthology/data.py: 4 changes (4 additions, 0 deletions)
@@ -24,8 +24,12 @@
ANTHOLOGY_URL = "https://www.aclweb.org/anthology/{}"
ATTACHMENT_URL = "https://www.aclweb.org/anthology/attachments/{}"

# Names of XML elements that may appear multiple times
LIST_ELEMENTS = ("attachment", "author", "editor", "video", "revision", "erratum")

def get_journal_title(top_level_id, volume_title):
# TODO: consider moving this from code to data (perhaps
# under <booktitle> in the volume metadata
if top_level_id[0] == "J":
year = int(top_level_id[1:3])
if year >= 65 and year <= 83:
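The new LIST_ELEMENTS constant names the XML tags that may legitimately repeat within a paper entry; whatever consumes it (presumably the shared parse_element helper imported in papers.py, which is not shown in this diff) can then accumulate those tags into lists while keeping everything else single-valued. A rough sketch of that pattern, with a hypothetical store() helper:

```python
# Names of XML elements that may appear multiple times (copied from data.py above)
LIST_ELEMENTS = ("attachment", "author", "editor", "video", "revision", "erratum")

def store(attrib, tag, value):
    """Hypothetical helper: repeatable tags accumulate, other tags stay scalar."""
    if tag in LIST_ELEMENTS:
        attrib.setdefault(tag, []).append(value)
    else:
        attrib[tag] = value

attrib = {}
store(attrib, "author", "Jane Doe")
store(attrib, "author", "John Doe")
store(attrib, "title", "An Example Paper")
print(attrib)  # {'author': ['Jane Doe', 'John Doe'], 'title': 'An Example Paper'}
```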
bin/anthology/papers.py: 203 changes (79 additions, 124 deletions)
@@ -15,8 +15,8 @@
# limitations under the License.

import logging as log
from .people import PersonName
from .utils import (
parse_element,
infer_attachment_url,
remove_extra_whitespace,
is_journal,
@@ -28,25 +28,73 @@
# For BibTeX export
from .formatter import bibtex_encode, bibtex_make_entry

# Names of XML elements that may appear multiple times
_LIST_ELEMENTS = ("attachment", "author", "editor", "video", "revision", "erratum")


class Paper:
def __init__(self, paper_id, top_level_id, formatter):
def __init__(self, paper_id, volume, formatter):
self.parent_volume = volume
self.formatter = formatter
self.parent_volume = None
self.paper_id = paper_id
self.top_level_id = top_level_id
self.attrib = {}
self._id = paper_id
self._bibkey = False
self.is_volume = paper_id == '0'

# initialize metadata with keys inherited from volume
self.attrib = {}
for key, value in volume.attrib.items():
# Only inherit 'editor' for frontmatter
if (key == 'editor' and not self.is_volume) or key in ('collection_id', 'booktitle', 'id', 'meta_data', 'meta_journal_title', 'meta_volume', 'meta_issue', 'sigs', 'venues', 'meta_date', 'url'):
continue

self.attrib[key] = value

def from_xml(xml_element, *args):
paper = Paper(xml_element.get("id"), *args)
paper._parse_element(xml_element)
# Default to paper ID "0" (for front matter)
paper = Paper(xml_element.get("id", '0'), *args)

# Set values from parsing the XML element (overwriting
# and changing some initialized from the volume metadata)
for key, value in parse_element(xml_element).items():
if key == 'author' and 'editor' in paper.attrib:
del paper.attrib['editor']
paper.attrib[key] = value

# Frontmatter title is the volume 'booktitle'
if paper.is_volume:
paper.attrib['xml_title'] = paper.attrib['xml_booktitle']
paper.attrib['xml_title'].tag = 'title'

# Remove booktitle for frontmatter and journals
if paper.is_volume or is_journal(paper.full_id):
del paper.attrib['xml_booktitle']

# Expand URLs with paper ID
for tag in ('revision', 'erratum'):
if tag in paper.attrib:
for item in paper.attrib[tag]:
if not item['url'].startswith(paper.full_id):
log.error(
"{} must begin with paper ID '{}', but is '{}'".format(
tag, paper.full_id, item['url']
)
)
item['url'] = data.ANTHOLOGY_URL.format(item['url'])

if 'attachment' in paper.attrib:
for item in paper.attrib['attachment']:
item['url'] = infer_attachment_url(item['url'], paper.full_id)

# Explicitly construct URL of original version of the paper
# -- this is a bit hacky, but it's not given in the XML
# explicitly
if 'revision' in paper.attrib:
paper.attrib['revision'].insert(0, {
"value": "{}v1".format(paper.full_id),
"id": "1",
"url": data.ANTHOLOGY_URL.format( "{}v1".format(paper.full_id)) } )


paper.attrib["title"] = paper.get_title("plain")
if "booktitle" in paper.attrib:
paper.attrib["booktitle"] = paper.get_booktitle("plain")

if "editor" in paper.attrib:
if paper.is_volume:
if "author" in paper.attrib:
@@ -64,116 +64,13 @@ def from_xml(xml_element, *args):
paper.full_id
)
)
if "year" not in paper.attrib:
paper._infer_year()
if "pages" in paper.attrib:
if paper.attrib["pages"] is not None:
paper._interpret_pages()
else:
del paper.attrib["pages"]
return paper

def _parse_element(self, paper_element):
# read & store values
if "href" in paper_element.attrib:
self.attrib["attrib_href"] = paper_element.get("href")
self.attrib["url"] = paper_element.get("href")
elif not (self.is_volume and is_journal(self.full_id)):
# Generate a URL, except for top-level journal entries
self.attrib["url"] = data.ANTHOLOGY_URL.format(self.full_id)
for element in paper_element:
# parse value
tag = element.tag.lower()
if tag in ("abstract", "title", "booktitle"):
tag = "xml_{}".format(tag)
value = element
elif tag == "attachment":
value = {
"filename": element.text,
"type": element.get("type", "attachment"),
"url": infer_attachment_url(element.text, self.full_id),
}
elif tag in ("author", "editor"):
id_ = element.attrib.get("id", None)
value = (PersonName.from_element(element), id_)
elif tag in ("erratum", "revision"):
if tag == "revision" and "revision" not in self.attrib:
# Explicitly construct URL of original version of the paper
# -- this is a bit hacky, but it's not given in the XML
# explicitly
self.attrib["revision"] = [
{
"value": "{}v1".format(self.full_id),
"id": "1",
"url": data.ANTHOLOGY_URL.format(
"{}v1".format(self.full_id)
),
}
]
if not element.text.startswith(self.full_id):
log.error(
"{} must begin with paper ID '{}', but is '{}'".format(
tag, self.full_id, element.text
)
)
value = {
"value": element.text,
"id": element.get("id"),
"url": data.ANTHOLOGY_URL.format(element.text),
}
elif tag == "mrf":
value = {"filename": element.text, "src": element.get("src")}
elif tag == "video":
# Treat videos the same way as other attachments
tag = "attachment"
value = {
"filename": element.get("href"),
"type": element.get("tag", "video"),
"url": infer_attachment_url(element.get("href"), self.full_id),
}
elif tag in ("dataset", "software"):
value = {
"filename": element.text,
"type": tag,
"url": infer_attachment_url(element.text, self.full_id),
}
tag = "attachment"
else:
value = element.text
# store value
if tag == "url":
continue # We basically have to ignore this for now
if tag in _LIST_ELEMENTS:
try:
self.attrib[tag].append(value)
except KeyError:
self.attrib[tag] = [value]
else:
if tag in self.attrib:
log.warning(
"{}: Unexpected multiple occurrence of '{}' element".format(
self.full_id, tag
)
)
self.attrib[tag] = value

def _infer_year(self):
"""Infer the year from the volume ID.

Many paper entries do not explicitly contain their year. This function assumes
that the paper's volume identifier follows the format 'xyy', where x is
some letter and yy are the last two digits of the year of publication.
"""
assert (
len(self.top_level_id) == 3
), "Couldn't infer year: unknown volume ID format"
digits = self.top_level_id[1:]
if int(digits) >= 60:
year = "19{}".format(digits)
else:
year = "20{}".format(digits)
self.attrib["year"] = year

def _interpret_pages(self):
"""Splits up 'pages' field into first and last page, if possible.

@@ -187,18 +132,28 @@ def _interpret_pages(self):
return

@property
def is_volume(self):
"""Determines if this paper is a regular paper or a proceedings volume.
def collection_id(self):
return self.parent_volume.collection_id

By default, each paper ID of format 'x000' will be treated as (the front
matter of) a proceedings volume, unless the XML is of type workshop,
where each paper ID of format 'xx00' is treated as one volume.
"""
return is_volume_id(self.full_id)
@property
def volume_id(self):
return self.parent_volume.volume_id

@property
def paper_id(self):
if self.collection_id[0] == "W" or self.collection_id == "C69":
# If volume is a workshop, use the last two digits of ID
_id = "{}{:02d}".format(self.volume_id, int(self._id))
else:
# If not, only the last three
_id = "{}{:03d}".format(self.volume_id, int(self._id))
# Just to be sure
assert len(_id) == 4
return _id

@property
def full_id(self):
return "{}-{}".format(self.top_level_id, self.paper_id)
return "{}-{}".format(self.collection_id, self.paper_id)

@property
def bibkey(self):
@@ -290,7 +245,7 @@ def as_bibtex(self):
("booktitle", self.parent_volume.get_title(form="latex"))
)
for entry in ("month", "year", "address", "publisher", "note"):
if entry in self.attrib:
if self.get(entry) is not None:
entries.append((entry, bibtex_encode(self.get(entry))))
for entry in ("url", "doi"):
if entry in self.attrib:
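The new paper_id and full_id properties shown above derive the public Anthology ID from the parent volume rather than from a stored top_level_id: workshop-style collections (prefix "W", plus the special-cased C69) use two-digit paper numbers on a two-character volume ID, everything else uses three-digit numbers on a one-character volume ID, so the local part is always four characters. A standalone sketch of that formatting rule (the helper name is hypothetical):

```python
def full_paper_id(collection_id: str, volume_id: str, paper_num: int) -> str:
    if collection_id.startswith("W") or collection_id == "C69":
        local = "{}{:02d}".format(volume_id, paper_num)   # two-digit volume, two-digit paper
    else:
        local = "{}{:03d}".format(volume_id, paper_num)   # one-digit volume, three-digit paper
    assert len(local) == 4, "volume + paper number should span four characters"
    return "{}-{}".format(collection_id, local)

print(full_paper_id("P19", "1", 23))   # P19-1023
print(full_paper_id("W19", "04", 7))   # W19-0407
```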
bin/anthology/people.py: 6 changes (3 additions, 3 deletions)
@@ -15,7 +15,7 @@
# limitations under the License.

import logging as log
from .formatter import bibtex_encode
import anthology.formatter as my_formatter


class PersonName:
@@ -60,8 +60,8 @@ def id_(self):

def as_bibtex(self):
if not self.first:
return "{{{}}}".format(bibtex_encode(self.last))
return bibtex_encode("{}, {}".format(self.last, self.first))
return "{{{}}}".format(my_formatter.bibtex_encode(self.last))
return my_formatter.bibtex_encode("{}, {}".format(self.last, self.first))

def as_dict(self):
return {"first": self.first, "last": self.last, "full": self.full}
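The people.py change switches from a direct bibtex_encode import to a module-qualified reference (per the commit note about a clash with a 'formatter' name); the formatting behavior itself is unchanged. A sketch of what as_bibtex produces, with bibtex_encode stubbed out since the real routine lives in anthology/formatter.py:

```python
def bibtex_encode(text: str) -> str:
    return text  # placeholder for the real LaTeX-escaping routine

def person_as_bibtex(first: str, last: str) -> str:
    # Single-token names are wrapped in braces so BibTeX treats them as
    # an indivisible last name; otherwise emit "Last, First".
    if not first:
        return "{{{}}}".format(bibtex_encode(last))
    return bibtex_encode("{}, {}".format(last, first))

print(person_as_bibtex("", "Statistics Canada"))  # {Statistics Canada}
print(person_as_bibtex("Matt", "Post"))           # Post, Matt
```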