Skip to content

Commit

Permalink
feat: including generator meta tag in .readalong files
Browse files Browse the repository at this point in the history
test: fixing ci
  • Loading branch information
deltork committed Jun 26, 2024
1 parent 05d8a9b commit b1cc821
Show file tree
Hide file tree
Showing 30 changed files with 211 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,5 @@ jobs:
# This test will fail if the output encoding is cp1252
# Warning: the diff line below is PowerShell syntax, not bash!
run: |
echo ćś | readalongs make-xml -l fra - - > cs.readalong
echo ćś | readalongs make-xml -l fra - - | findstr /v meta > cs.readalong
if (diff (cat cs.readalong) (cat test/data/cs-ref.readalong)) { throw "Output did not match reference" }
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,7 @@ $RECYCLE.BIN/
*.lnk

# End of https://www.gitignore.io/api/linux,macos,python,windows,visualstudiocode

#temporary file
.tmp
.conda
2 changes: 1 addition & 1 deletion readalongs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import sys

VERSION = "1.0"
VERSION = "1.1"

if sys.version_info < (3, 8, 0): # pragma: no cover
sys.exit(
Expand Down
2 changes: 1 addition & 1 deletion readalongs/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.20240426"
__version__ = "1.1.20240619"
15 changes: 10 additions & 5 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from pympi.Praat import TextGrid
from webvtt import Caption, WebVTT

from readalongs import VERSION
from readalongs._version import __version__
from readalongs.audio_utils import (
extract_section,
mute_section,
Expand Down Expand Up @@ -175,7 +177,7 @@ def parse_and_make_xml(
"""Parse XML input and run tokenization and G2P.
Args:
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.0.dtd)
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.1.dtd)
config (dict): Optional; ReadAlong-Studio configuration to use
save_temps (str): Optional; Save temporary files, by default None
verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
Expand Down Expand Up @@ -568,7 +570,7 @@ def align_audio(
"""Align an XML input file to an audio file.
Args:
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.0.dtd)
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.1.dtd)
audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
unit (str): Optional; Element to create alignments for, by default 'w'
bare (boolean): Optional;
Expand Down Expand Up @@ -1156,7 +1158,8 @@ def convert_to_xhtml(tokenized_xml, title="Book"):

# TODO: add this <!-- DO NOT USE THIS DATA WITHOUT EXPLICIT PERMISSION --> to template
RAS_TEMPLATE = """<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}" />
<text xml:lang="{{main_lang}}" fallback-langs="{{fallback_langs}}">
<body>
{{#pages}}
Expand All @@ -1177,7 +1180,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.0.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand All @@ -1194,6 +1197,8 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->
kwargs = {
"main_lang": text_languages[0],
"fallback_langs": ",".join(text_languages[1:]),
"studio_version": __version__,
"format_version": VERSION,
}
pages: List[dict] = []
paragraphs: List[dict] = []
Expand Down Expand Up @@ -1223,7 +1228,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->


def create_input_ras(**kwargs):
"""Create input xml in ReadAlong XML format (see static/read-along-1.0.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd)
Uses readlines to infer paragraph and sentence structure from plain text.
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down
103 changes: 103 additions & 0 deletions readalongs/static/read-along-1.1.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
<!-- VERSION: 1.1 -->
<!-- read-along: outermost element of a ReadAlong document.
     Content: any number of meta, text, body, div, span, anchor, silence,
     graphic, p, s and w children, in any order.
     All attributes are optional (#IMPLIED). The generator templates write
     the format version (e.g. "1.1") into the version attribute. -->
<!ELEMENT read-along (meta|text|body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST read-along
use-assets-folder CDATA #IMPLIED
href CDATA #IMPLIED
audio CDATA #IMPLIED
xml:lang CDATA #IMPLIED
language CDATA #IMPLIED
lang CDATA #IMPLIED
version CDATA #IMPLIED>

<!-- text: element-only container (no character data allowed directly).
     Content: any number of body, div, span, anchor, silence, graphic, p, s
     and w children. Unlike read-along, it cannot contain meta or text.
     All attributes are optional. fallback-langs is presumably a
     comma-separated list of language codes (the CLI template joins them
     with ","); TODO confirm the accepted separator. -->
<!ELEMENT text (body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST text
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
fallback-langs CDATA #IMPLIED
id CDATA #IMPLIED>

<!-- body: element-only container (no character data allowed directly).
     Content: any number of div, span, anchor, silence, graphic, p, s and w
     children. It cannot contain meta, text or another body.
     All attributes are optional. -->
<!ELEMENT body (div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST body
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED>

<!-- anchor: empty element with a single required attribute, time
     (for example time=".5s").
     NOTE(review): presumably marks a known audio timestamp at this point in
     the text; the DTD only checks that time is present, not its format. -->
<!ELEMENT anchor EMPTY>
<!ATTLIST anchor time CDATA #REQUIRED>

<!-- silence: empty element with a single required attribute, dur.
     NOTE(review): presumably the length of a silence to insert at this
     point; the DTD only checks that dur is present, not its format or
     units. -->
<!ELEMENT silence EMPTY>
<!ATTLIST silence dur CDATA #REQUIRED>

<!-- graphic: empty element referencing an external resource.
     url is required; id is optional. Allowed inside read-along, text, body,
     div and span, but not inside p, s, w or syl. -->
<!ELEMENT graphic EMPTY>
<!ATTLIST graphic
url CDATA #REQUIRED
id CDATA #IMPLIED>

<!-- div: mixed-content container. It may hold character data and any number
     of div, span, anchor, silence, graphic, p, s and w children, so div
     elements can nest.
     All attributes are optional. The project's sample documents use
     type="page" to mark a page. do-not-align="true" excludes the element
     from alignment.
     NOTE(review): time and dur presumably hold the aligned start time and
     duration; confirm units against the aligner output. -->
<!ELEMENT div (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST div
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- span: mixed-content container with the same content model and attribute
     list as div: character data plus any number of div, span, anchor,
     silence, graphic, p, s and w children.
     It is also the only element in this DTD that may appear inside every
     other non-empty text element (div, p, s, w, syl).
     All attributes are optional. do-not-align="true" excludes the element
     from alignment.
     NOTE(review): time and dur presumably hold the aligned start time and
     duration; confirm units against the aligner output. -->
<!ELEMENT span (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST span
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- p: mixed-content element (paragraph level in the generator template).
     Content: character data plus any number of span, anchor, silence, s and
     w children. A p cannot directly contain div, graphic or another p.
     All attributes are optional; there is no type attribute here, unlike
     div and span. do-not-align="true" excludes the element from alignment.
     NOTE(review): time and dur presumably hold the aligned start time and
     duration; confirm units against the aligner output. -->
<!ELEMENT p (#PCDATA|span|anchor|silence|s|w)*>
<!ATTLIST p
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- s: mixed-content element (sentence level in the generator template).
     Content: character data plus any number of span, anchor, silence and w
     children. An s cannot directly contain div, graphic, p or another s.
     All attributes are optional. do-not-align="true" excludes the element
     from alignment.
     NOTE(review): time and dur presumably hold the aligned start time and
     duration; confirm units against the aligner output. -->
<!ELEMENT s (#PCDATA|span|anchor|silence|w)*>
<!ATTLIST s
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- w: mixed-content element (word level; "w" is the default alignment unit
     in align_audio).
     Content: character data plus any number of span and syl children.
     Anchors, silences and larger units are not allowed inside a w.
     All attributes are optional. do-not-align="true" excludes the element
     from alignment.
     NOTE(review): effective-g2p-lang presumably records which language's
     G2P mapping was actually applied, and ARPABET the resulting
     pronunciation; time and dur presumably hold the aligned start time and
     duration. Confirm against the tokenizer, G2P and aligner code. -->
<!ELEMENT w (#PCDATA|span|syl)*>
<!ATTLIST w
xml:lang CDATA #IMPLIED
effective-g2p-lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- syl: mixed-content sub-word element, allowed only inside w.
     Content: character data plus any number of span children.
     All attributes are optional. The attribute list matches w except that
     effective-g2p-lang is not declared here.
     NOTE(review): ARPABET, time and dur presumably carry the pronunciation
     and aligned timing of this unit; confirm against the aligner code. -->
<!ELEMENT syl (#PCDATA|span)*>
<!ATTLIST syl
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- meta: empty name/content metadata element, allowed only as a direct
     child of read-along. Both name and content are required.
     The CLI template emits one to record the producing tool, e.g.
     <meta name="generator" content="@readalongs/studio (cli) VERSION"/>. -->
<!ELEMENT meta EMPTY>
<!ATTLIST meta name CDATA #REQUIRED
content CDATA #REQUIRED>
2 changes: 1 addition & 1 deletion readalongs/text/make_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="application-name" content="read along">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="generator" content="@readalongs/studio-cli {studio_version}">
<meta name="generator" content="@readalongs/studio (cli) {studio_version}">
<title>{title}</title>
<script>{js}</script>
<style attribution="See https://fonts.google.com/attribution for copyrights and font attribution">{fonts}</style>
Expand Down
2 changes: 1 addition & 1 deletion readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def copy_file_to_zip(zip_path, origin_path, destination_path):
<meta charset="UTF-8">
<meta name="application-name" content="read along">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="generator" content="@readalongs/studio-cli {studio_version}">
<meta name="generator" content="@readalongs/studio (cli) {studio_version}">
<title>{title}</title>
<!-- Import fonts. Material Icons are needed by the web component -->
<link href="https://fonts.googleapis.com/css?family=Lato%7CMaterial+Icons%7CMaterial+Icons+Outlined" rel="stylesheet">
Expand Down
8 changes: 6 additions & 2 deletions readalongs/web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
from pydantic import BaseModel, Field
from starlette.background import BackgroundTask

from readalongs import VERSION
from readalongs._version import __version__
from readalongs.align import create_ras_from_text, save_label_files, save_subtitles
from readalongs.log import LOGGER, capture_logs
from readalongs.text.add_ids_to_xml import add_ids
Expand Down Expand Up @@ -77,7 +79,7 @@
# Call get_langs() when the server loads to load the languages into memory
LANGS = get_langs()
# Get the DTD
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.0.dtd")
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.1.dtd")
with open(DTDPATH) as dtdfh:
DTD = etree.DTD(dtdfh)

Expand Down Expand Up @@ -323,7 +325,8 @@ class ConvertRequest(BaseModel):
dedent(
"""\
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="%s">
<meta name="generator" content="@readalongs/studio (cli) %s"/>
<text xml:lang="dan" fallback-langs="und" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0">
Expand All @@ -337,6 +340,7 @@ class ConvertRequest(BaseModel):
</body>
</text>
</read-along>"""
% (VERSION, __version__)
)
],
)
Expand Down
2 changes: 1 addition & 1 deletion test/data/cs-ref.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="1.1">
<text xml:lang="fra" fallback-langs="und">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-anchors.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-anchors2.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<anchor time=".5s"/>
<body>
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-dna.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-package.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-silence-bad.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-silence.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-subword.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<!-- To exclude any element from alignment, add the do-not-align="true" attribute to
it, e.g., <p do-not-align="true">...</p>, or
<s>Some text <foo do-not-align="true">do not align this</foo> more text</s> -->
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra-translated.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<!-- To exclude any element from alignment, add the do-not-align="true" attribute to
it, e.g., <p do-not-align="true">...</p>, or
<s>Some text <foo do-not-align="true">do not align this</foo> more text</s> -->
Expand Down
3 changes: 2 additions & 1 deletion test/data/ej-fra.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
Binary file added test/data/ej-fra.wav
Binary file not shown.
3 changes: 2 additions & 1 deletion test/data/fra-prepared.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}" />
<text xml:lang="fra" fallback-langs="und">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/fra-tokenized.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text xml:lang="fra">
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/mixed-langs.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text>
<body>
<div type="page">
Expand Down
3 changes: 2 additions & 1 deletion test/data/patrickxtlan.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<read-along version="{{format_version}}">
<meta name="generator" content="@readalongs/studio (cli) {{studio_version}}"/>
<text>
<body>
<p>
Expand Down
11 changes: 8 additions & 3 deletions test/test_align_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from lxml.html import fromstring
from sound_swallower_stub import SoundSwallowerStub

from readalongs import VERSION
from readalongs._version import __version__
from readalongs.cli import align, langs


Expand Down Expand Up @@ -325,10 +327,13 @@ def test_bad_anchors(self):
"""Make sure invalid anchors yield appropriate errors"""

xml_text = """<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0"><text xml:lang="fra"><body><p>
<read-along version="%s"><meta name="generator" content="@readalongs/studio (cli) %s"/><text xml:lang="fra"><body><p>
<anchor /><s>Bonjour.</s><anchor time="invalid"/>
</p></body></text></read-along>
"""
""" % (
VERSION,
__version__,
)
xml_file = join(self.tempdir, "bad-anchor.readalong")
with open(xml_file, "w", encoding="utf8") as f:
print(xml_text, file=f)
Expand Down Expand Up @@ -626,4 +631,4 @@ def slurp_text(filename, encoding):


if __name__ == "__main__":
main()
main()
2 changes: 1 addition & 1 deletion test/test_dtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from lxml import etree

DTDPATH = os.path.join(
dirname(__file__), "..", "readalongs", "static", "read-along-1.0.dtd"
dirname(__file__), "..", "readalongs", "static", "read-along-1.1.dtd"
)

VALID_RAS = """
Expand Down
Loading

0 comments on commit b1cc821

Please sign in to comment.