Generic MathML Errors using Parser Lookahead (some endpoints updated) (…

…#386) ### Changes - Generic MathML Parser Error update: - Added tag-level errors to `generic_mathml.rs` parser: `<mi>`, `<mn>`, `<msup>`, `<msub>`, `<msqrt>`, `<mfrac>`, `<mrow>`, `<munder>`, `<mover>`, `<msubsup>`, `<mtext>`, `<mstyle>`, `<mspace>`, `<mo>`. - `/mathml/ast-graph` endpoint now shows these errors. - First Order ODE Parser Error update: - Updated `ParseError` messages using the `context` combinator, removing the previous macro usage. - The generic MathML errors were excluded as this parser uses `interpreted_mathml.rs`, which doesn't encounter those errors at the `math_expression` level. - `/pmml/equations-to-amr` and `/latex/equations-to-amr` are passing on these errors from `skema-rs`. ### Notes - Lookahead Algorithm: - Solved the problem of adding tag-level parse errors by implementing a lookahead in the parser. - In `math_expression`, instead of using `alt` for multiple branches of parsers, the following steps were adopted: 1. Grab the content of the next tag. 2. If it is an open tag, call the appropriate parser. If the parser fails, we can immediately stop execution with [`cut`](https://tikv.github.io/doc/nom/combinator/fn.cut.html) because of the lookahead knowledge. 3. If it is a close tag, return an `Error` instead of a `Failure`. `Failure` cuts the execution, but returning an `Error` allows the parent combinator to continue using parsers on the remaining input. - This approach enables `many0` and other combinators to work as expected. When we run out of things (like math expressions) for `many0` to match (i.e., we encountered a close tag), we return an `Error`, allowing the parent combinator to continue. But, as long as we know there is an expression to match (open tag), we can guarantee that if the internal parser (for `<mi>`, `<mo>`, etc.) fails, it was due to bad input. ### Testing - `cargo test` and `cargo clippy` are passing.
ml4ai · Aug 4, 2023 · 1198e53 · 1198e53
1 parent e8b2ac7
commit 1198e53
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 65 deletions.
diff --git a/skema/rest/workflows.py b/skema/rest/workflows.py
@@ -5,15 +5,16 @@
 
 
 from typing import List
-from skema.rest.proxies import SKEMA_RS_ADDESS
-from skema.rest import schema, utils, comments_proxy
-from skema.program_analysis.comments import MultiFileCodeComments
-from skema.img2mml import eqn2mml
-from skema.skema_py import server as code2fn
+
+import requests
 from fastapi import APIRouter, File, UploadFile
 from starlette.responses import JSONResponse
-import requests
 
+from skema.img2mml import eqn2mml
+from skema.program_analysis.comments import MultiFileCodeComments
+from skema.rest import comments_proxy, schema, utils
+from skema.rest.proxies import SKEMA_RS_ADDESS
+from skema.skema_py import server as code2fn
 
 router = APIRouter()
 
@@ -89,7 +90,7 @@ async def equations_to_amr(data: schema.EquationLatexToAMR):
         return JSONResponse(
             status_code=400,
             content={
-                "error": f"MORAE PUT /mathml/amr failed to process payload",
+                "error": f"MORAE PUT /mathml/amr failed to process payload with error {res.text}",
                 "payload": payload,
             },
         )
@@ -99,14 +100,13 @@ async def equations_to_amr(data: schema.EquationLatexToAMR):
 # pmml -> amr
 @router.post("/pmml/equations-to-amr", summary="Equations pMML → AMR")
 async def equations_to_amr(data: schema.MmlToAMR):
-
     payload = {"mathml": data.equations, "model": data.model}
     res = requests.put(f"{SKEMA_RS_ADDESS}/mathml/amr", json=payload)
     if res.status_code != 200:
         return JSONResponse(
             status_code=400,
             content={
-                "error": f"MORAE PUT /mathml/amr failed to process payload",
+                "error": f"MORAE PUT /mathml/amr failed to process payload with error {res.text}",
                 "payload": payload,
             },
         )
@@ -147,6 +147,7 @@ async def code_snippets_to_rn_amr(system: code2fn.System):
     return res.json()
 """
 
+
 # zip archive -> fn -> petrinet amr
 @router.post(
     "/code/codebase-to-pn-amr", summary="Code repo (zip archive) → PetriNet AMR"

diff --git a/skema/skema-rs/mathml/src/parsers/first_order_ode.rs b/skema/skema-rs/mathml/src/parsers/first_order_ode.rs
@@ -7,9 +7,7 @@ use crate::{
         Ci, MathExpression, Type,
     },
     parsers::{
-        generic_mathml::{
-            append_msg_to_parse_err, attribute, equals, etag, stag, ws, IResult, Span,
-        },
+        generic_mathml::{attribute, equals, etag, stag, ws, IResult, Span},
         interpreted_mathml::{
             ci_univariate_func, ci_unknown, first_order_derivative_leibniz_notation,
             math_expression, newtonian_derivative, operator,
@@ -20,6 +18,7 @@ use crate::{
 
 use derive_new::new;
 
+use nom::error::context;
 use nom::{
     branch::alt,
     bytes::complete::tag,
@@ -52,36 +51,40 @@ pub struct FirstOrderODE {
 
 /// Parse a first order ODE with a single derivative term on the LHS.
 pub fn first_order_ode(input: Span) -> IResult<FirstOrderODE> {
-    let (s, _) = stag!("math")(input)
-        .map_err(|err| append_msg_to_parse_err!(err, "MISSING STARTING <math> TAG."))?;
+    let (s, _) = context("MISSING STARTING <math> TAG.", stag!("math"))(input)?;
 
     // Recognize LHS derivative
-    let (s, (_, ci)) = alt((
-        first_order_derivative_leibniz_notation,
-        newtonian_derivative,
-    ))(s)
-    .map_err(|err| append_msg_to_parse_err!(err, "INVALID DERIVATIVE ON LHS."))?;
+    let (s, (_, ci)) = context(
+        "INVALID LHS DERIVATIVE.",
+        alt((
+            first_order_derivative_leibniz_notation,
+            newtonian_derivative,
+        )),
+    )(s)?;
 
     // Recognize equals sign
-    let (s, _) = delimited(stag!("mo"), equals, etag!("mo"))(s)
-        .map_err(|err| append_msg_to_parse_err!(err, "MISSING EQUALS SIGN."))?;
+    let (s, _) = context(
+        "MISSING EQUALS SIGN.",
+        delimited(stag!("mo"), equals, etag!("mo")),
+    )(s)?;
 
     // Recognize other tokens
-    let (s, remaining_tokens) = many1(alt((
-        map(ci_univariate_func, MathExpression::Ci),
-        map(ci_unknown, |Ci { content, .. }| {
-            MathExpression::Ci(Ci {
-                r#type: Some(Type::Function),
-                content,
-            })
-        }),
-        map(operator, MathExpression::Mo),
-        math_expression,
-    )))(s)
-    .map_err(|err| append_msg_to_parse_err!(err, "COULD NOT PARSE RHS."))?;
-
-    let (s, _) = etag!("math")(s)
-        .map_err(|err| append_msg_to_parse_err!(err, "MISSING ENDING </math> tag."))?;
+    let (s, remaining_tokens) = context(
+        "INVALID RHS.",
+        many1(alt((
+            map(ci_univariate_func, MathExpression::Ci),
+            map(ci_unknown, |Ci { content, .. }| {
+                MathExpression::Ci(Ci {
+                    r#type: Some(Type::Function),
+                    content,
+                })
+            }),
+            map(operator, MathExpression::Mo),
+            math_expression,
+        ))),
+    )(s)?;
+
+    let (s, _) = context("INVALID ENDING MATH TAG", etag!("math"))(s)?;
 
     let ode = FirstOrderODE {
         lhs_var: ci,

diff --git a/skema/skema-rs/mathml/src/parsers/generic_mathml.rs b/skema/skema-rs/mathml/src/parsers/generic_mathml.rs
@@ -11,10 +11,12 @@ use nom::{
     branch::alt,
     bytes::complete::{tag, take_until},
     character::complete::{alphanumeric1, multispace0, not_line_ending},
-    combinator::{map, map_parser, opt, recognize, value},
+    combinator::{cut, map, map_parser, opt, peek, recognize, value},
     multi::many0,
     sequence::{delimited, pair, preceded, separated_pair, tuple},
 };
+use nom::{character::complete::char as nom_char, error::context};
+
 use nom_locate::LocatedSpan;
 use std::str::FromStr;
 
@@ -94,16 +96,6 @@ pub fn attribute(input: Span) -> IResult<(&str, &str)> {
     Ok((s, (&key, &value)))
 }
 
-#[macro_export]
-macro_rules! append_msg_to_parse_err {
-    ($mapped_err:expr, $msg: expr) => {{
-        $mapped_err.map(|mut my_err| {
-            my_err.append_message($msg);
-            return my_err;
-        })
-    }};
-}
-
 #[macro_export]
 macro_rules! stag {
     ($tag:expr) => {{
@@ -327,23 +319,51 @@ fn mo_line(input: Span) -> IResult<MathExpression> {
 
 /// Math expressions
 pub fn math_expression(input: Span) -> IResult<MathExpression> {
-    ws(alt((
-        map(mi, MathExpression::Mi),
-        mn,
-        msup,
-        msub,
-        msqrt,
-        mfrac,
-        map(mrow, MathExpression::Mrow),
-        munder,
-        mover,
-        msubsup,
-        mtext,
-        mstyle,
-        mspace,
-        mo_line,
-        mo,
-    )))(input)
+    // Lookahead: peek at the next tag (open or close) without consuming any input
+    let tag_name = peek(delimited(
+        multispace0,
+        delimited(
+            nom_char('<'),
+            take_until(">"),
+            alt((tag(">"), tag("/>"))), // Matches both self-closing and regular tags
+        ),
+        multispace0,
+    ))(input)
+    .map(|(_, tag_name)| {
+        let tag_name_string = tag_name.to_string();
+        let mut split_tag_name = tag_name_string.split_whitespace(); // We only want the tag name and no attributes
+        split_tag_name.next().unwrap().to_string()
+    })?;
+
+    if tag_name.contains('/') {
+        // Next tag is a closing tag: there are no more math expressions here, which is not an error in itself.
+        // `mn` is called without `cut`; the intent is a recoverable `Error` (not a `Failure`) so the parent combinator can continue.
+        mn(input)
+    } else {
+        match tag_name.as_str() {
+            "mi" => context("FAILED TO PARSE <mi>", cut(ws(map(mi, MathExpression::Mi))))(input),
+            "mn" => context("FAILED TO PARSE <mn>", cut(ws(mn)))(input),
+            "msup" => context("FAILED TO PARSE <msup>", cut(ws(msup)))(input),
+            "msub" => context("FAILED TO PARSE <msub>", cut(ws(msub)))(input),
+            "msqrt" => context("FAILED TO PARSE <msqrt>", cut(ws(msqrt)))(input),
+            "mfrac" => context("FAILED TO PARSE <mfrac>", cut(ws(mfrac)))(input),
+            "mrow" => context(
+                "FAILED TO PARSE <mrow>",
+                cut(map(mrow, MathExpression::Mrow)),
+            )(input),
+            "munder" => context("FAILED TO PARSE <munder>", cut(ws(munder)))(input),
+            "mover" => context("FAILED TO PARSE <mover>", cut(ws(mover)))(input),
+            "msubsup" => context("FAILED TO PARSE <msubsup>", cut(ws(msubsup)))(input),
+            "mtext" => context("FAILED TO PARSE <mtext>", cut(ws(mtext)))(input),
+            "mstyle" => context("FAILED TO PARSE <mstyle>", cut(ws(mstyle)))(input),
+            "mspace" => context("FAILED TO PARSE <mspace>", cut(ws(mspace)))(input),
+            "mo" => context("FAILED TO PARSE <mo>", cut(ws(alt((mo, mo_line)))))(input),
+            _ => {
+                println!("Something went wrong. We grabbed a {} tag", tag_name);
+                context("SOMETHING WENT WRONG. WE SHOULDN'T BE HERE.", cut(mn))(input)
+            }
+        }
+    }
 }
 
 /// testing MathML documents
@@ -571,7 +591,6 @@ fn test_mathml_parser() {
 }
 
 // Exporting macros
-pub(crate) use append_msg_to_parse_err;
 pub(crate) use elem2;
 pub(crate) use elem_many0;
 pub(crate) use etag;