Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[For merging on 30.09] landing page is canonical #513

Merged
merged 12 commits into from
Oct 1, 2019
5 changes: 5 additions & 0 deletions bin/anthology/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
ANTHOLOGY_PREFIX = "https://www.aclweb.org/anthology"

ANTHOLOGY_URL = ANTHOLOGY_PREFIX + '/{}'
ANTHOLOGY_PDF = ANTHOLOGY_PREFIX + '/{}.pdf'
ANTHOLOGY_BIB = ANTHOLOGY_PREFIX + '/{}.bib'
ANTHOLOGY_ENDNOTE = ANTHOLOGY_PREFIX + '/{}.endf'
ANTHOLOGY_MODS = ANTHOLOGY_PREFIX + '/{}.xml'

ATTACHMENT_URL = ANTHOLOGY_PREFIX + '/attachments/{}'

# Names of XML elements that may appear multiple times
Expand Down
15 changes: 13 additions & 2 deletions bin/anthology/papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ def from_xml(xml_element, *args):
# Default to paper ID "0" (for front matter)
paper = Paper(xml_element.get("id", '0'), ingest_date, *args)

# The landing page URL inside the Anthology. The XML file contains <url> tags
# for papers that have a PDF, but the XML reading code swaps this to 'pdf',
# so we can safely use 'url' here for the landing page.
page_url = data.ANTHOLOGY_URL.format(paper.full_id)
paper.attrib['url'] = page_url

# Set values from parsing the XML element (overwriting
# and changing some initialized from the volume metadata)
for key, value in parse_element(xml_element).items():
Expand All @@ -77,7 +83,12 @@ def from_xml(xml_element, *args):
tag, paper.full_id, item['url']
)
)
item['url'] = data.ANTHOLOGY_URL.format(item['url'])
item['url'] = data.ANTHOLOGY_PDF.format(item['url'])

# Add explicit links to bib, mods XML, and Endnote
paper.attrib['bib'] = data.ANTHOLOGY_BIB.format(paper.full_id)
paper.attrib['endf'] = data.ANTHOLOGY_ENDNOTE.format(paper.full_id)
paper.attrib['mods'] = data.ANTHOLOGY_MODS.format(paper.full_id)

if 'attachment' in paper.attrib:
for item in paper.attrib['attachment']:
Expand All @@ -90,7 +101,7 @@ def from_xml(xml_element, *args):
paper.attrib['revision'].insert(0, {
"value": "{}v1".format(paper.full_id),
"id": "1",
"url": data.ANTHOLOGY_URL.format( "{}v1".format(paper.full_id)) } )
"url": data.ANTHOLOGY_URL.format( "{}v1.pdf".format(paper.full_id)) } )
mjpost marked this conversation as resolved.
Show resolved Hide resolved

paper.attrib["title"] = paper.get_title("plain")
if "booktitle" in paper.attrib:
Expand Down
5 changes: 4 additions & 1 deletion bin/anthology/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,11 @@ def parse_element(xml_element):
value = element.text

if tag == "url":
# Use the tag 'pdf' instead of 'url'
tag = 'pdf'

# Convert relative URLs to canonical ones
value = element.text if element.text.startswith('http') else data.ANTHOLOGY_URL.format(element.text)
value = element.text if element.text.startswith('http') else data.ANTHOLOGY_PDF.format(element.text)

if tag in data.LIST_ELEMENTS:
try:
Expand Down
9 changes: 3 additions & 6 deletions hugo/layouts/papers/list-entry.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@
{{ $paper := index (index $.Site.Data.papers $volume_id) .Params.anthology_id }}
<p class="d-sm-flex align-items-stretch">
<span class="d-block mr-2 text-nowrap list-button-row">
{{- with $paper.url -}}
{{- with $paper.pdf -}}
<a class="badge badge-primary align-middle mr-1" href="{{ . }}" data-toggle="tooltip" data-placement="top" title="Open PDF">
pdf
mjpost marked this conversation as resolved.
Show resolved Hide resolved
</a>
{{- if and (hasPrefix . $.Site.Params.baseURL) (eq . (strings.TrimSuffix ".pdf" .)) -}}
<a class="d-none" href="{{ . }}.pdf" title="Hidden link to PDF with extension">pdf</a>
{{- end -}}
{{- end -}}
mbollmann marked this conversation as resolved.
Show resolved Hide resolved
{{- $bibfile := printf "/papers/%s/%s/%s.bib" (slicestr $volume_id 0 1) $volume_id .Params.anthology_id -}}
{{- if (fileExists (printf "/data-export/%s" $bibfile)) -}}
<a class="badge badge-secondary align-middle mr-1" href="{{ $bibfile | relURL }}" data-toggle="tooltip" data-placement="top" title="Export to BibTeX">
<a class="badge badge-secondary align-middle mr-1" href="{{ $paper.bib }}" data-toggle="tooltip" data-placement="top" title="Export to BibTeX">
bib
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bib path is now ${ANTH_ID}.bib, e.g., https://aclweb.org/anthology/P19-1002.bib, instead of the nested version.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're no longer using $bibfile, but the link is still within an if clause that checks for $bibfile.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine though, right? If the bibfile exists, we generate the appropriate link to it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't change this because I'm not sure how to fix it. If the bibfile is present, we generate the canonical link to it, which is handled by a redirect internally.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, the question is whether we still want a check for the existence of the file in the Hugo template.

If we do, then this current solution is suboptimal IMO since it requires us manually remembering and updating the actual local path to the file (in the $bibfile := ... line), in case it ever changes.

If we do not, we should just throw out the surrounding lines altogether.

</a>
{{- end -}}
Expand All @@ -28,7 +25,7 @@
{{ if eq hugo.Environment "development" }}
<span class="badge badge-light align-middle">{{ .Params.anthology_id }}</span>
{{ end }}
<strong><a class="align-middle" href="{{ .RelPermalink }}">{{ $paper.title_html | safeHTML }}</a></strong>
<strong><a class="align-middle" href="{{ $paper.url }}/">{{ $paper.title_html | safeHTML }}</a></strong>
<br />
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Link to the canonical page (https://aclweb.org/anthology/P19-1002/) instead of the nested version. This will get rid of the double search results.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same concern as above: URL can refer to external content.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. url now points to the page's Anthology URL; pdf points to the PDF (which can be internal or external).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The downside of how this is now handled is that it completely breaks local browsing of the site, since links always point to the live anthology.

I see two options here:

  1. Making the behaviour dependent on Hugo's "environment" variable, using .RelPermalink when in development mode and $paper.url otherwise. I don't really like this, as diverging behaviour has the potential to hide bugs.

  2. Generating the flat structure directly with Hugo, as discussed in Should the canonical URL be the landing page? #480, instead of relying on redirects.

{{ with $paper.author }}
{{ $len := (len $paper.author) }}
Expand Down
12 changes: 6 additions & 6 deletions hugo/layouts/papers/single.html
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@ <h5 class="card-title">Abstract</h5>
<dt class="acl-button-row">Bib Export formats:</dt>
<dd class="acl-button-row">
{{ if (fileExists (printf "/data-export/%s" $bibfile)) }}
<a class="btn btn-secondary btn-sm" href="{{ $bibfile | relURL }}">BibTeX</a>
<a class="btn btn-secondary btn-sm" href="{{ $paper.bib }}">BibTeX</a>
{{ end }}
{{ $expfile := printf "/papers/%s/%s/%s.xml" (slicestr $volume_id 0 1) $volume_id $anthology_id }}
{{ if (fileExists (printf "/data-export/%s" $expfile)) }}
<a class="btn btn-secondary btn-sm" href="{{ $expfile | relURL }}">MODS XML</a>
<a class="btn btn-secondary btn-sm" href="{{ $paper.mods }}">MODS XML</a>
{{ end }}
{{ $endfile := printf "/papers/%s/%s/%s.endf" (slicestr $volume_id 0 1) $volume_id $anthology_id }}
{{ if (fileExists (printf "/data-export/%s" $endfile)) }}
<a class="btn btn-secondary btn-sm" href="{{ $endfile | relURL }}">EndNote</a>
<a class="btn btn-secondary btn-sm" href="{{ $paper.endf }}">EndNote</a>
{{ end }}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the canonical URL prefix instead of the deeply nested prefix.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same concern as with $bibfile above.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So do you suggest that we generate bib, endf, etc links in the YAML, and only use those if present?

Copy link
Member Author

@mjpost mjpost Sep 22, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I ended up doing for clarity.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is nice, but has the same issue with local browsing–the links will always point to the live server.

{{ if (fileExists (printf "/data-export/%s" $bibfile)) }}
<button type="button" class="btn btn-clipboard btn-secondary btn-sm d-none" data-clipboard-text="{{ readFile (printf "/data-export/%s" $bibfile) }}"><i class="far fa-clipboard pr-2"></i>Copy BibTeX to Clipboard</button>
Expand Down Expand Up @@ -179,14 +179,14 @@ <h5 class="card-title">Abstract</h5>
</a>
{{ end }}
{{ else }}
{{ with $paper.url }}
<a class="btn btn-primary" href="{{ . }}" title="Open PDF of '{{ $paper.title | htmlEscape }}'">
{{ with $paper.pdf }}
<a class="btn btn-primary" href="{{ . }}" title="Open PDF of '{{ $paper.title | htmlEscape }}'">
mjpost marked this conversation as resolved.
Show resolved Hide resolved
<i class="far fa-file-pdf"></i><span class="pl-2">PDF</span>
</a>
{{ end }}
{{ end }}
{{ if (fileExists (printf "/data-export/%s" $bibfile)) }}
<a class="btn btn-secondary" href="{{ $bibfile | relURL }}" title="Export '{{ $paper.title | htmlEscape }}' to bib format">
<a class="btn btn-secondary" href="{{ $paper.bib }}" title="Export '{{ $paper.title | htmlEscape }}' to bib format">
<i class="fas fa-file-export"></i><span class="pl-2 transform-lower-sm">Bib</span><span class="d-none d-sm-inline">TeX</span>
</a>
{{ end }}
Expand Down
8 changes: 2 additions & 6 deletions hugo/static/.htaccess
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ RewriteRule ^papers/[A-Za-z]/[A-Za-z][0-9][0-9]/([A-Za-z])([0-9][0-9])\-([0-9][0
#

## PDF redirection
# Canonical URL (a plain ACL ID with no file extension, e.g., P17-1069 loads P/P17/P17-1069.pdf)
RewriteRule ^([A-Za-z])([0-9][0-9])\-([0-9][0-9][0-9][0-9])$ /anthology-files/pdf/$1/$1$2/$1$2-$3.pdf [L,NC]
# Canonical URL (a plain ACL ID with no file extension) -> landing page
RewriteRule ^([A-Za-z])([0-9][0-9])\-([0-9][0-9][0-9][0-9])\/?$ papers/$1/$1$2/$1$2-$3/ [L,NC]

# Volume URLs (e.g., P17-1 loads P/P17/P17-1.pdf)
RewriteRule ^([A-Za-z])([0-9][0-9])\-([0-9]{1,2})$ /anthology-files/pdf/$1/$1$2/$1$2-$3.pdf [L,NC]
Expand All @@ -68,10 +68,6 @@ RewriteRule ^([A-Za-z])([0-9][0-9])\-([0-9][0-9][0-9][0-9])([ve][0-9]+)$ /anthol
# Attachments (e.g., P17-1069.Poster.pdf loads /anthology-files/attachments/P/P17/P17-1069.Poster.pdf)
RewriteRule ^attachments/([A-Za-z])([0-9][0-9])\-([0-9][0-9][0-9][0-9])(\..*)?$ /anthology-files/attachments/$1/$1$2/$1$2-$3$4 [L,NC]

## Paper and author pages and bibtex
# The Paper metadata page (e.g., P17-1069/ loads papers/P/P17/P17-1069/index.html)
RewriteRule ^([A-Za-z])([0-9][0-9])\-([0-9][0-9][0-9][0-9])\/$ papers/$1/$1$2/$1$2-$3/index.html [L,NC]

# Redirects for bib, MODS XML, Endnote (e.g., /P17-1069.bib loads papers/P/P17/P17-1069.bib)
RewriteRule ^([A-Za-z])([0-9][0-9])\-([0-9][0-9][0-9][0-9])\.([a-z]+)$ papers/$1/$1$2/$1$2-$3.$4 [L,NC]

Expand Down