home ~ projects ~ socials

Parsing Word Parts With nom In Rust

This is prep work for Neopoligen. It looks for words containing characters that are also used to mark the boundaries of spans.

---
[dependencies]
nom = "7.1.3"
---

#![allow(unused_imports)]
use nom::IResult;
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::take;
use nom::bytes::complete::is_not;
use nom::character::complete::space1;
use nom::multi::many1;

/// One piece of parsed input text.
///
/// `Clone` and `Eq` are derived in addition to `Debug` and `PartialEq` so
/// token trees can be duplicated and compared without extra boilerplate.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
  /// A run of ordinary word characters.
  WordSeg(String),
  /// The character that followed a backslash in the input.
  EscapedChar(String),
  /// A run of whitespace between words.
  Space,
  /// The tokens found between a pair of `^^` markers.
  Footnote(Vec<Token>),
}

/// Runs the example inputs through `parse` and checks the resulting tokens.
fn main() {
  // Shorthand constructors keep the expected token lists readable.
  let word = |text: &str| Token::WordSeg(text.to_string());
  let escaped_char = |text: &str| Token::EscapedChar(text.to_string());

  // A single bare word is one segment.
  assert_eq!(parse("alfa"), Ok(("", vec![word("alfa")])));

  // Words separated by a space produce a Space token between them.
  assert_eq!(
    parse("bravo charlie"),
    Ok(("", vec![word("bravo"), Token::Space, word("charlie")]))
  );

  // Text wrapped in ^^ markers is collected into a Footnote.
  assert_eq!(
    parse("^^example^^"),
    Ok(("", vec![Token::Footnote(vec![word("example")])]))
  );

  // A backslash-escaped caret splits the word around the escaped character.
  assert_eq!(
    parse(r#"de\^lta"#),
    Ok(("", vec![word("de"), escaped_char("^"), word("lta")]))
  );

  println!("Finished tests");
}

/// Parses `source` into a list of tokens.
///
/// At each position the alternatives are tried in order: escaped character,
/// footnote, whitespace, then word segment. At least one token must match;
/// parsing stops at the first position where no alternative applies and the
/// unconsumed remainder is returned alongside the tokens.
pub fn parse(source: &str) -> IResult<&str, Vec<Token>> {
  many1(alt((escaped, footnote, space, word_seg)))(source)
}

/// Parses a footnote: `^^`, one or more tokens, then a closing `^^`.
///
/// The body is parsed by calling `parse` recursively, so it may contain any
/// token kind that `parse` accepts.
pub fn footnote(source: &str) -> IResult<&str, Token> {
  let (after_open, _) = tag("^^")(source)?;
  let (after_body, children) = parse(after_open)?;
  // The closing marker is consumed and discarded; only the body is kept.
  tag("^^")(after_body).map(|(rest, _)| (rest, Token::Footnote(children)))
}


/// Parses a backslash followed by one character.
///
/// The backslash is dropped and the character after it is returned as
/// `Token::EscapedChar`.
pub fn escaped(source: &str) -> IResult<&str, Token> {
  let (after_backslash, _) = tag(r#"\"#)(source)?;
  take(1usize)(after_backslash)
    .map(|(rest, literal)| (rest, Token::EscapedChar(literal.to_string())))
}

/// Parses one or more whitespace characters (as recognized by `space1`)
/// into a single `Token::Space`; the matched text itself is discarded.
pub fn space(source: &str) -> IResult<&str, Token> {
  space1(source).map(|(rest, _)| (rest, Token::Space))
}

/// Parses a run of ordinary word characters into `Token::WordSeg`.
///
/// The run stops at the first space, tab, backslash, or caret. These are the
/// characters that start the other token kinds, so they must not be
/// swallowed here.
pub fn word_seg(source: &str) -> IResult<&str, Token> {
  // `space1` (used by `space`) accepts tabs as well as spaces, so the tab has
  // to be in the stop set too; otherwise "a\tb" is taken as a single word
  // segment and the tab never becomes a `Token::Space`.
  let (source, result) = is_not(" \t\\^")(source)?;
  Ok((source, Token::WordSeg(result.to_string())))
}
Output:
Finished tests
-- end of line --