The words Under construction in black text on a yellow background with diagonal black stripes surrounding it
I'm in the process of moving my site. It's still a work in progress. Please excuse the mess and broken links.

Rust nom Parser V9

TODO: Pull subtitle into page object
Code
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::take;
use nom::character::complete::multispace1;
use nom::character::complete::none_of;
use nom::character::complete::one_of;
use nom::combinator::eof;
use nom::combinator::peek;
use nom::sequence::tuple;
use nom::IResult;
use nom::Parser;

/// Self-check for the parser: feeds a small sample document through `parse`
/// and asserts that the annotated output matches the expected text.
fn main() {
    // Sample input, one entry per line; the trailing empty entry makes the
    // joined text end with a newline.
    let input_lines = [
        "- item alfa",
        "line bravo",
        "",
        "paragraph charlie",
        "with lines",
        "",
        "and another",
        "",
        "- item delta",
        "line echo",
        "",
        "# Header Foxtrot",
        "",
    ];
    // The same document with a `-> <kind>` marker in front of each block.
    let expected_lines = [
        "-> list",
        "",
        "- item alfa",
        "line bravo",
        "",
        "-> p",
        "",
        "paragraph charlie",
        "with lines",
        "",
        "and another",
        "",
        "-> list",
        "",
        "- item delta",
        "line echo",
        "",
        "-> h1",
        "",
        "Header Foxtrot",
        "",
    ];

    let input = input_lines.join("\n");
    let actual = parse(&input);
    println!("{}", actual);

    let wanted = expected_lines.join("\n");
    assert_eq!(wanted, actual);
    println!("Test complete");
}

/// Runs the area state machine over `source` and returns the annotated text.
///
/// A newline is put in front of the input before crawling starts. `crawler`
/// is then called repeatedly, each time with the area returned by the previous
/// call (starting in `Area::Base`), and every returned fragment is appended to
/// the output until no input remains.
///
/// The `'a` lifetime parameter is unused; it is kept so the signature stays
/// exactly as it was.
///
/// # Panics
///
/// Panics if `crawler` returns an error, because its result is unwrapped.
pub fn parse<'a>(source: &str) -> String {
    let padded = format!("\n{}", source);
    let mut remaining = padded.as_str();
    let mut area = Area::Base;
    let mut output = String::new();

    loop {
        let (rest, (fragment, next_area)) = crawler(remaining, area).unwrap();
        output.push_str(&fragment);
        remaining = rest;
        area = next_area;

        if remaining.is_empty() {
            break;
        }
    }

    output
}

pub fn crawler(source: &str, area: Area) -> IResult<&str, (String, Area)> {
    match area {
        Area::Base => {
            let (a, b) = alt((
                // Clear any leading text so this always
                // starts on a new line
                multispace1.map(|_| (format!(""), Area::Base)),
                // Start a list if you find one
                tuple((tag("- "), take(1u32)))
                    .map(|x| (format!("-> list\n\n- {}", x.1), Area::List)),
                // Header
                tuple((tag("# "), take(1u32))).map(|x| (format!("-> h1\n\n{}", x.1), Area::List)),
                // This is what closes the processing
                eof.map(|_| (format!(""), Area::Base)),
                // Anything else is the start of a paragraph
                take(1u32).map(|x| (format!("-> p\n\n{}", x), Area::Paragraph)),
            ))(source)?;
            Ok((a, b))
        }
        Area::List => {
            let (a, b) = alt((
                tuple((tag("\n\n"), peek(none_of("-")))).map(|_| (format!("\n\n"), Area::Base)),
                tag("\n- ").map(|_| (format!("- "), Area::List)),
                take(1u32).map(|x| (format!("{}", x), Area::List)),
            ))(source)?;
            Ok((a, b))
        }
        Area::Paragraph => {
            let (a, b) = alt((
                // this could be done to take into account the `.`` after
                // the number with a more complex parser
                tuple((tag("\n\n"), peek(one_of("-#*1234567890"))))
                    .map(|_| (format!("\n\n"), Area::Base)),
                take(1u32).map(|x| (format!("{}", x), Area::Paragraph)),
            ))(source)?;
            Ok((a, b))
        }
    }
}

/// The kind of block the crawler is currently inside.
///
/// The value is passed to `crawler` on every step and replaced by the area it
/// returns. It is a plain state tag, so it is cheap to copy and can be
/// compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Area {
    /// Between blocks; the next token decides which block starts. `parse`
    /// begins in this area.
    Base,
    /// Inside a list. `crawler` also switches to this area after a `# ` header.
    List,
    /// Inside a paragraph.
    Paragraph,
}

Debugging Stuff

I'm moving stuff around right now. All this below is helping me figure out where to put stuff

        -- title

Rust nom Parser V9

This is another evolution of the parser. 

It uses peek to look ahead to the next 
character to determine what to do on the 
next step. 

Each part keeps track of what type of
area it's in. 

I wouldn't be surprised to find a more 
efficient way to do this, but it's working 
for me to process 9,000 files with a much
more complicated version in less than 
30 seconds. I'm good with it. 


-- code
-- rust

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::take;
use nom::character::complete::multispace1;
use nom::character::complete::none_of;
use nom::character::complete::one_of;
use nom::combinator::eof;
use nom::combinator::peek;
use nom::sequence::tuple;
use nom::IResult;
use nom::Parser;

/// Self-check for the parser: feeds a small sample document through `parse`
/// and asserts that the annotated output matches the expected text.
fn main() {
    // Sample input, one entry per line; the trailing empty entry makes the
    // joined text end with a newline.
    let input_lines = [
        "- item alfa",
        "line bravo",
        "",
        "paragraph charlie",
        "with lines",
        "",
        "and another",
        "",
        "- item delta",
        "line echo",
        "",
        "# Header Foxtrot",
        "",
    ];
    // The same document with a `-> <kind>` marker in front of each block.
    let expected_lines = [
        "-> list",
        "",
        "- item alfa",
        "line bravo",
        "",
        "-> p",
        "",
        "paragraph charlie",
        "with lines",
        "",
        "and another",
        "",
        "-> list",
        "",
        "- item delta",
        "line echo",
        "",
        "-> h1",
        "",
        "Header Foxtrot",
        "",
    ];

    let input = input_lines.join("\n");
    let actual = parse(&input);
    println!("{}", actual);

    let wanted = expected_lines.join("\n");
    assert_eq!(wanted, actual);
    println!("Test complete");
}

/// Runs the area state machine over `source` and returns the annotated text.
///
/// A newline is put in front of the input before crawling starts. `crawler`
/// is then called repeatedly, each time with the area returned by the previous
/// call (starting in `Area::Base`), and every returned fragment is appended to
/// the output until no input remains.
///
/// The `'a` lifetime parameter is unused; it is kept so the signature stays
/// exactly as it was.
///
/// # Panics
///
/// Panics if `crawler` returns an error, because its result is unwrapped.
pub fn parse<'a>(source: &str) -> String {
    let padded = format!("\n{}", source);
    let mut remaining = padded.as_str();
    let mut area = Area::Base;
    let mut output = String::new();

    loop {
        let (rest, (fragment, next_area)) = crawler(remaining, area).unwrap();
        output.push_str(&fragment);
        remaining = rest;
        area = next_area;

        if remaining.is_empty() {
            break;
        }
    }

    output
}

pub fn crawler(source: &str, area: Area) -> IResult<&str, (String, Area)> {
    match area {
        Area::Base => {
            let (a, b) = alt((
                // Clear any leading text so this always
                // starts on a new line
                multispace1.map(|_| (format!(""), Area::Base)),
                // Start a list if you find one
                tuple((tag("- "), take(1u32)))
                    .map(|x| (format!("-> list\n\n- {}", x.1), Area::List)),
                // Header
                tuple((tag("# "), take(1u32))).map(|x| (format!("-> h1\n\n{}", x.1), Area::List)),
                // This is what closes the processing
                eof.map(|_| (format!(""), Area::Base)),
                // Anything else is the start of a paragraph
                take(1u32).map(|x| (format!("-> p\n\n{}", x), Area::Paragraph)),
            ))(source)?;
            Ok((a, b))
        }
        Area::List => {
            let (a, b) = alt((
                tuple((tag("\n\n"), peek(none_of("-")))).map(|_| (format!("\n\n"), Area::Base)),
                tag("\n- ").map(|_| (format!("- "), Area::List)),
                take(1u32).map(|x| (format!("{}", x), Area::List)),
            ))(source)?;
            Ok((a, b))
        }
        Area::Paragraph => {
            let (a, b) = alt((
                // this could be done to take into account the `.`` after
                // the number with a more complex parser
                tuple((tag("\n\n"), peek(one_of("-#*1234567890"))))
                    .map(|_| (format!("\n\n"), Area::Base)),
                take(1u32).map(|x| (format!("{}", x), Area::Paragraph)),
            ))(source)?;
            Ok((a, b))
        }
    }
}

/// The kind of block the crawler is currently inside.
///
/// The value is passed to `crawler` on every step and replaced by the area it
/// returns. It is a plain state tag, so it is cheap to copy and can be
/// compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Area {
    /// Between blocks; the next token decides which block starts. `parse`
    /// begins in this area.
    Base,
    /// Inside a list. `crawler` also switches to this area after a `# ` header.
    List,
    /// Inside a paragraph.
    Paragraph,
}



-- blurb

-- categories
-- Rust 
-- nom 

-- metadata
-- date: 2023-03-28 18:50:52
-- id: 2nf8cgbe
-- site: aws
-- type: post
-- status: published