Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Author/editor id attribute #297

Merged
merged 14 commits into from
May 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bin/anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ def import_directory(self, importdir):
self.load_schema(importdir + "/xml/schema.rng")
for xmlfile in glob(importdir + "/xml/*.xml"):
self.import_file(xmlfile)

self.pindex.verify()

def import_file(self, filename):
tree = etree.parse(filename)
if self.schema is not None:
Expand Down
197 changes: 114 additions & 83 deletions bin/anthology/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,43 +45,52 @@ def __init__(self, parent, srcdir=None):
self._parent = parent
self.bibkeys = set()
self.stopwords = load_stopwords("en")
self.canonical = defaultdict(list) # maps canonical names to variants
self.variants = {} # maps variant names to canonical names
self._all_slugs = set([""])
self.slugs = {} # maps names to unique slugs
self.coauthors = defaultdict(Counter) # maps names to co-author names
self.papers = defaultdict(lambda: defaultdict(list))
self.id_to_canonical = {} # maps ids to canonical names
self.id_to_used = defaultdict(set) # maps ids to all names actually used
self.name_to_ids = defaultdict(list) # maps canonical/variant names to ids
self.coauthors = defaultdict(Counter) # maps ids to co-author ids
self.id_to_papers = defaultdict(lambda: defaultdict(list)) # id -> role -> papers
self.name_to_papers = defaultdict(lambda: defaultdict(list)) # name -> (explicit id?) -> papers; used only for error checking
if srcdir is not None:
self.load_variant_list(srcdir)

def load_variant_list(self, directory):
with open("{}/yaml/name_variants.yaml".format(directory), "r") as f:
name_list = yaml.load(f, Loader=Loader)

# Reserve ids for people with explicit ids in variant list
for entry in name_list:
if "id" in entry:
id_ = entry["id"]
canonical = entry["canonical"]
canonical = PersonName.from_dict(canonical)
self.set_canonical_name(id_, canonical)
for entry in name_list:
try:
canonical = entry["canonical"]
variants = entry["variants"]
variants = entry.get("variants", [])
id_ = entry.get("id", None)
except (KeyError, TypeError):
log.error("Couldn't parse name variant entry: {}".format(entry))
continue
canonical = PersonName.from_dict(canonical)
_ = self.papers[
canonical
] # insert empty entry for canonical if not present
if id_ is None:
id_ = self.fresh_id(canonical)
self.set_canonical_name(id_, canonical)
for variant in variants:
variant = PersonName.from_dict(variant)
_ = self.papers[variant] # insert empty entry if not present
if variant in self.variants:
if variant in self.name_to_ids:
log.error(
"Tried to add '{}' as variant of '{}', but is already a variant of '{}'".format(
repr(variant),
repr(canonical),
repr(self.variants[variant]),
repr(self.id_to_canonical[
self.name_to_ids[variant][0]
]),
)
)
continue
self.variants[variant] = canonical
self.canonical[canonical].append(variant)
self.add_variant_name(id_, variant)

def _is_stopword(self, word, paper):
"""Determines if a given word should be considered a stopword for
Expand Down Expand Up @@ -133,9 +142,9 @@ def create_bibkey(self, paper):
names = paper.get("editor", [])
if names:
if len(names) > BIBKEY_MAX_NAMES:
bibnames = "{}-etal".format(slugify(names[0].last))
bibnames = "{}-etal".format(slugify(names[0][0].last))
else:
bibnames = "-".join(slugify(n.last) for n in names)
bibnames = "-".join(slugify(n.last) for n, _ in names)
else:
bibnames = "nn"
title = [
Expand Down Expand Up @@ -171,89 +180,111 @@ def register(self, paper):
)
paper.bibkey = self.create_bibkey(paper)
for role in ("author", "editor"):
for name in paper.get(role, []):
for name, id_ in paper.get(role, []):
if id_ is None:
if len(self.name_to_ids.get(name, [])) > 1:
log.error("Paper {} uses ambiguous name '{}' without id".format(paper.full_id, name))
log.error(" Please add an id, for example: {}".format(" ".join(self.name_to_ids[name])))
id_ = self.resolve_name(name)["id"]
explicit = False
else:
if id_ not in self.id_to_canonical:
log.error("Paper {} uses name '{}' with id '{}' that does not exist".format(paper.full_id, name, id_))
explicit = True

self.id_to_used[id_].add(name)
# Register paper
self.papers[name][role].append(paper.full_id)
# Make sure canonical names are prioritized for slugs
if self.is_canonical(name):
self.get_slug(name)
self.id_to_papers[id_][role].append(paper.full_id)
self.name_to_papers[name][explicit].append(paper.full_id)
# Register co-author(s)
for author in paper.get(role):
if author != name:
self.coauthors[name][author] += 1

def names(self):
return self.papers.keys()

def __len__(self):
return len(self.papers)

def is_canonical(self, name):
return name not in self.variants
for co_name, co_id in paper.get(role):
if co_id is None:
co_id = self.resolve_name(co_name)["id"]
if co_id != id_:
self.coauthors[id_][co_id] += 1

def has_variants(self, name):
return name in self.canonical
def verify(self):
for name, ids in self.name_to_ids.items():
for id_ in ids:
cname = self.id_to_canonical[id_]
if name != cname and name not in self.id_to_used[id_]:
log.warning(
"Variant name '{}' of '{}' is not used".format(
repr(name),
repr(cname)))
for name, d in self.name_to_papers.items():
if len(d[False]) > 0 and len(d[True]) > 0:
log.error("Name '{}' is used both with and without explicit id".format(repr(name)))
log.error(
" Please add an id to paper(s): {}".format(
" ".join(d[False])
)
)
log.error(
" Or remove the id from paper(s): {}".format(
" ".join(d[True])
)
)

def personids(self):
return self.id_to_canonical.keys()

def get_canonical_variant(self, name):
"""Maps a name to its canonical variant."""
return self.variants.get(name, name)
def get_canonical_name(self, id_):
return self.id_to_canonical[id_]

def get_all_variants(self, name):
"""Return a list of all variants for a given name.
def set_canonical_name(self, id_, name):
if id_ in self.id_to_canonical:
log.error("Person id '{}' is used by both '{}' and '{}'".format(id_, name, self.id_to_canonical[id_]))
self.id_to_canonical[id_] = name
self.name_to_ids[name].append(id_)

Includes the supplied name itself.
"""
if not self.is_canonical(name):
name = self.get_canonical_variant(name)
return self.canonical[name] + [name]
def add_variant_name(self, id_, name):
self.name_to_ids[name].append(id_)

def get_registered_variants(self, name):
"""Return a list of variants for a given name that are actually
associated with papers.
def get_used_names(self, id_):
"""Return a list of all names used for a given person."""
return self.id_to_used[id_]

Will only return true variants, not including the canonical name.
"""
if not self.is_canonical(name):
name = self.get_canonical_variant(name)
return [n for n in self.canonical[name] if n in self.papers]
def resolve_name(self, name, id_=None):
"""Find person named 'name' and return a dict with fields
'first', 'last', 'id'"""
if id_ is None:
if name not in self.name_to_ids:
id_ = self.fresh_id(name)
self.set_canonical_name(id_, name)
else:
ids = self.name_to_ids[name]
assert len(ids) > 0
if len(ids) > 1:
log.debug("Name '{}' is ambiguous between {}".format(
repr(name),
', '.join("'{}'".format(i) for i in ids)
))
id_ = ids[0]
d = name.as_dict()
d["id"] = id_
return d

def get_slug(self, name):
if name in self.slugs:
return self.slugs[name]
def fresh_id(self, name):
assert name not in self.name_to_ids, name
slug, i = slugify(repr(name)), 0
while slug in self._all_slugs:
while slug == "" or slug in self.id_to_canonical:
i += 1
slug = "{}{}".format(slugify(repr(name)), i)
self._all_slugs.add(slug)
self.slugs[name] = slug
return slug

def get_papers(self, name, role=None, include_variants=False):
if include_variants:
return [
p
for n in self.get_all_variants(name)
for p in self.get_papers(n, role=role)
]
def get_papers(self, id_, role=None):
if role is None:
return [p for p_list in self.papers[name].values() for p in p_list]
return self.papers[name][role]
return [p for p_list in self.id_to_papers[id_].values() for p in p_list]
return self.id_to_papers[id_][role]

def get_coauthors(self, name, include_variants=False):
if include_variants:
return [
p for n in self.get_all_variants(name) for p in self.get_coauthors(n)
]
return self.coauthors[name].items()
def get_coauthors(self, id_):
return self.coauthors[id_].items()

def get_venues(self, vidx: VenueIndex, name, include_variants=False):
def get_venues(self, vidx: VenueIndex, id_):
"""Get a list of venues a person has published in, with counts."""
venues = Counter()
if include_variants:
for n in self.get_all_variants(name):
venues.update(self.get_venues(vidx, n))
else:
for paper in self.get_papers(name):
for venue in vidx.get_associated_venues(paper):
venues[venue] += 1
for paper in self.get_papers(id_):
for venue in vidx.get_associated_venues(paper):
venues[venue] += 1
return venues
5 changes: 3 additions & 2 deletions bin/anthology/papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def _parse_element(self, paper_element):
"url": infer_attachment_url(element.text),
}
elif tag in ("author", "editor"):
value = PersonName.from_element(element)
id_ = element.attrib.get("id", None)
value = (PersonName.from_element(element), id_)
elif tag in ("erratum", "revision"):
if tag == "revision" and "revision" not in self.attrib:
# Explicitly construct URL of original version of the paper
Expand Down Expand Up @@ -258,7 +259,7 @@ def as_bibtex(self):
for people in ("author", "editor"):
if people in self.attrib:
entries.append(
(people, " and ".join(p.as_bibtex() for p in self.get(people)))
(people, " and ".join(p.as_bibtex() for p, _ in self.get(people)))
)
if is_journal(self.full_id):
entries.append(
Expand Down
59 changes: 0 additions & 59 deletions bin/check_name_variants.py

This file was deleted.

3 changes: 0 additions & 3 deletions bin/create_hugo_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,6 @@ def create_people(srcdir, clean=False):
data = yaml.load(f, Loader=Loader)
# Create a page stub for each person
for name, entry in data.items():
# Only create page stub when name doesn't link to a canonical entry
if "canonical_entry" in entry:
continue
person_dir = "{}/content/people/{}".format(srcdir, name[0])
if not os.path.exists(person_dir):
os.makedirs(person_dir)
Expand Down
Loading