Skip to content

Commit

Permalink
Now support semi-colon ending controlwords
Browse files Browse the repository at this point in the history
  • Loading branch information
d0rianb committed Nov 15, 2023
1 parent 7af60c2 commit 3ab7d70
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 227 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/target
/Cargo.lock
.idea
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The library is split into 2 main components:
The lexer scans the document and returns a `Vec<Token>` which represents the RTF file in a code-understandable manner.
To use it:
```rust
use rtf_parser::{Lexer, Parser};
use rtf_parser::{Lexer, Parser, Token};

let tokens: Vec<Token> = Lexer::scan("<rtf>");
```
Expand Down
26 changes: 26 additions & 0 deletions RTF-test.rtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
\rtf1\ansi\ansicpg1252\cocoartf2639
\cocoatextscaling0\cocoaplatform0
{
\fonttbl
\f0
\fswiss
\fcharset0 Helvetica;
\f1
\fswiss
\fcharset0 Helvetica-Bold;
}
{
\colortbl;
\red255
\green255
\blue255;
}
{\*\expandedcolortbl;;}
\paperw11900\paperh16840\margl1440\margr1440\vieww25400\viewh13640\viewkind0
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural\partightenfactor0

\f0\fs24 \cf0 Je suis un document RTF\
Test en
\f1\b gras
}
15 changes: 5 additions & 10 deletions src/header.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::collections::HashMap;
use crate::{ControlWord, Token};
use crate::tokens::{ControlWord, Token};

pub type FontRef = u16;
pub type FontTable<'a> = HashMap<FontRef, Font<'a>>;
Expand All @@ -18,19 +18,16 @@ pub struct Font<'a> {
}

#[allow(dead_code)]
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Default)]
pub enum CharacterSet {
#[default]
Ansi,
Mac,
Pc,
Pca,
Ansicpg(u16),
}

impl Default for CharacterSet {
fn default() -> Self { CharacterSet::Ansi }
}

impl CharacterSet {
pub fn from(token: &Token) -> Option<Self> {
match token {
Expand All @@ -42,8 +39,9 @@ impl CharacterSet {
}

#[allow(dead_code)]
#[derive(Debug, PartialEq, Hash, Clone)]
#[derive(Debug, PartialEq, Hash, Clone, Default)]
pub enum FontFamily {
#[default]
Nil,
Roman,
Swiss,
Expand All @@ -54,9 +52,6 @@ pub enum FontFamily {
Bidi,
}

impl Default for FontFamily {
fn default() -> Self { FontFamily::Nil }
}

impl FontFamily {
pub fn from(string: &str) -> Option<Self> {
Expand Down
36 changes: 24 additions & 12 deletions src/lexer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::tokens::{ControlWord, Token};
use crate::utils::StrUtils;
use crate::{ControlWord, Token};

pub struct Lexer;

Expand Down Expand Up @@ -43,6 +43,7 @@ impl Lexer {
return tokens;
}

/// Get a string slice cut by the scanner and return the corresponding token(s)
fn tokenize(slice: &str) -> Vec<Token> {
let mut starting_chars = slice.trim_matches(' ').chars().take(2);
return match (starting_chars.next(), starting_chars.next()) {
Expand All @@ -66,8 +67,11 @@ impl Lexer {
'a'..='z' => {
// Identify control word
// ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold")
let (ident, tail) = slice.split_first_whitespace();
let mut ret = vec![Token::ControlSymbol(ControlWord::from(ident))];
let (mut ident, tail) = slice.split_first_whitespace();
// if ident ends with a semicolon, strip it for correct value parsing
ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
let control_word = ControlWord::from(ident);
let mut ret = vec![Token::ControlSymbol(control_word)];
if tail.len() > 0 {
ret.push(Token::PlainText(tail));
}
Expand Down Expand Up @@ -97,9 +101,9 @@ impl Lexer {
#[cfg(test)]
pub(crate) mod tests {
use crate::lexer::Lexer;
use crate::ControlWord::{Ansi, Bold, FontNumber, FontSize, FontTable, Rtf, Unknown};
use crate::Property::*;
use crate::Token::*;
use crate::tokens::ControlWord::{Ansi, Bold, FontNumber, FontSize, FontTable, Rtf, Unknown};
use crate::tokens::Property::*;
use crate::tokens::Token::*;

#[test]
fn simple_tokenize_test() {
Expand Down Expand Up @@ -174,11 +178,19 @@ if (a == b) \{\
fn scan_ignorable_destination() {
let text = r"{\*\expandedcolortbl;;}";
let tokens = Lexer::scan(text);
assert_eq!(tokens, vec![
OpeningBracket,
IgnorableDestination,
ControlSymbol((Unknown(r"\expandedcolortbl;;"), None)),
ClosingBracket,
])
assert_eq!(
tokens,
vec![OpeningBracket, IgnorableDestination, ControlSymbol((Unknown(r"\expandedcolortbl;"), None)), ClosingBracket,]
)
}

// Checks that a control word directly followed by a semicolon (`\blue255;`)
// is lexed like one without it: the semicolon is dropped from the control word
// and the numeric parameter is still parsed as a `Value`.
#[test]
fn should_parse_control_symbol_ending_semicolon() {
let text = r"{\red255\blue255;}";
let tokens = Lexer::scan(text);
assert_eq!(
tokens,
// `\red` and `\blue` are expected as `Unknown` control words here
vec![OpeningBracket, ControlSymbol((Unknown(r"\red"), Value(255))), ControlSymbol((Unknown(r"\blue"), Value(255))), ClosingBracket]
);
}
}
132 changes: 6 additions & 126 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,135 +2,15 @@
// Support RTF version 1.9.1
// specification is available here : https://dokumen.tips/documents/rtf-specification.html

#![allow(irrefutable_let_patterns)]

mod tokens;
mod lexer;
mod parser;
mod header;
mod utils;

// expose the lexer and the parser
pub use crate::lexer::Lexer as Lexer;
pub use crate::parser::Parser as Parser;

// A lexical unit of an RTF document, as returned in the `Vec<Token>` built by the lexer.
// String payloads are `&'a str` slices, so no text is copied into the token.
#[allow(dead_code)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Token<'a> {
// Plain text, e.g. the text that follows a control word
PlainText(&'a str),
// Group start: `{`
OpeningBracket,
// Group end: `}`
ClosingBracket,
CRLF, // Line-return \n
IgnorableDestination, // \*\ <destination-name>
// A control word together with its parameter (see the `ControlSymbol` alias)
ControlSymbol(ControlSymbol<'a>),
}


// A control symbol is a pair (control_word, property).
// In the RTF specification, it refers to a 'control word entity'.
type ControlSymbol<'a> = (ControlWord<'a>, Property);

// Parameter attached to a control word
#[allow(dead_code)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Property {
    On,         // 1
    Off,        // 0
    Value(i32), // Explicit numeric parameter
    None,       // No parameter
}

impl Property {
    /// Interprets the property as a boolean flag.
    ///
    /// `On` and `None` (a bare control word) are `true`, `Off` is `false`,
    /// and a numeric `Value` is `true` only when it equals 1.
    fn as_bool(&self) -> bool {
        match *self {
            Property::Off => false,
            Property::On | Property::None => true,
            Property::Value(number) => number == 1,
        }
    }

    /// Returns the numeric parameter carried by `Value`, or 0 for every other variant.
    fn get_value(&self) -> i32 {
        match *self {
            Property::Value(number) => number,
            _ => 0,
        }
    }
}

/// The control words recognised by the lexer.
///
/// Any control word that is not listed explicitly is kept verbatim
/// (leading backslash included, numeric parameter excluded) in `Unknown`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ControlWord<'a> {
    Rtf,
    Ansi,

    FontTable,
    FontCharset,
    FontNumber,
    FontSize,

    ColorTable,
    FileTable,

    Italic,
    Bold,
    Underline,

    Unknown(&'a str),
}

impl<'a> ControlWord<'a> {
    /// Splits a raw control word such as `\fs24` into its name and numeric parameter.
    ///
    /// The parameter is the run of ASCII digits at the end of `input`, optionally
    /// preceded by a single `-` sign. When there are no trailing digits the
    /// property is `Property::None`; a `-` that is not followed by digits is
    /// treated as part of the name instead of a (malformed) parameter.
    ///
    /// # Panics
    ///
    /// Panics if the numeric parameter does not fit in an `i32`.
    pub fn from(input: &str) -> ControlSymbol {
        // Walk the string backward to find where the numeric parameter starts.
        let digit_count = input.bytes().rev().take_while(|byte| byte.is_ascii_digit()).count();
        let mut index = input.len() - digit_count;
        // Only a sign immediately followed by digits belongs to the parameter.
        if digit_count > 0 && input[..index].ends_with('-') {
            index -= 1;
        }

        // f0 -> prefix: f, suffix: 0
        // `index` always sits on a char boundary: everything after it is ASCII.
        let (prefix, suffix) = input.split_at(index);

        let property = if suffix.is_empty() {
            Property::None
        } else {
            match suffix.parse::<i32>() {
                Ok(value) => Property::Value(value),
                // The message is only built on the failure path.
                Err(_) => panic!("[Lexer] Unable to parse {} as integer", suffix),
            }
        };

        let control_word = match prefix {
            r"\rtf" => ControlWord::Rtf,
            r"\ansi" => ControlWord::Ansi,
            r"\fonttbl" => ControlWord::FontTable,
            // The RTF keyword is `\colortbl` (it was misspelled `\colortabl`).
            r"\colortbl" => ControlWord::ColorTable,
            r"\filetbl" => ControlWord::FileTable,
            r"\fcharset" => ControlWord::FontCharset,
            r"\f" => ControlWord::FontNumber,
            r"\fs" => ControlWord::FontSize,
            r"\i" => ControlWord::Italic,
            r"\b" => ControlWord::Bold,
            r"\u" => ControlWord::Underline,
            _ => ControlWord::Unknown(prefix),
        };
        (control_word, property)
    }
}

// Unit tests for `ControlWord::from`: splitting a raw control word into
// its name and its numeric parameter.
#[cfg(test)]
mod tests {
use crate::{ControlWord, Property};

// A trailing number is returned as the `Value` property.
#[test]
fn control_word_from_input_test() {
let input = r"\rtf1";
assert_eq!(ControlWord::from(input), (ControlWord::Rtf, Property::Value(1)))
}

// A leading minus sign is part of the parameter and yields a negative value.
#[test]
fn control_word_with_negative_parameter() {
let input = r"\rtf-1";
assert_eq!(ControlWord::from(input), (ControlWord::Rtf, Property::Value(-1)))
}
}
pub use crate::lexer::Lexer;
pub use crate::parser::{Parser, Painter};
pub use crate::tokens::Token;
Loading

0 comments on commit 3ab7d70

Please sign in to comment.