Skip to content

Commit

Permalink
feat: rudimentary regex support
Browse files Browse the repository at this point in the history
  • Loading branch information
fgdorais committed Dec 18, 2023
1 parent f130fd0 commit 4e9af27
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 0 deletions.
1 change: 1 addition & 0 deletions Parser.lean
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ import Parser.Char
import Parser.Error
import Parser.Parser
import Parser.Prelude
import Parser.RegEx
import Parser.Stream
5 changes: 5 additions & 0 deletions Parser/Char/Basic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Released under Apache 2.0 license as described in the file LICENSE.
-/

import Parser.Basic
import Parser.RegEx.Basic

namespace Parser.Char
variable {ε σ m} [Parser.Stream σ Char] [Parser.Error ε σ Char] [Monad m] [MonadExceptOf ε m]
Expand All @@ -20,6 +21,10 @@ def chars (tks : String) : ParserT ε σ Char m String :=
acc := acc.push (← token tk)
return acc

/-- `reMatch re` accepts and returns a string matching the regex `re`, otherwise fails -/
def reMatch (re : RegEx Char) : ParserT ε σ Char m String :=
String.mk <$> re.match

/-- `string tks` accepts and returns string `tks`, otherwise fails -/
def string [Parser.Error ε Substring Char] (tks : String) : ParserT ε Substring Char m String :=
withErrorMessage s!"expected {repr tks}" do
Expand Down
7 changes: 7 additions & 0 deletions Parser/RegEx.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/-
Copyright © 2022-2023 François G. Dorais, Kyrill Serdyuk, Emma Shroyer. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import Parser.RegEx.Basic
import Parser.RegEx.Compile
75 changes: 75 additions & 0 deletions Parser/RegEx/Basic.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@

/-
Copyright © 2022-2023 François G. Dorais, Kyrill Serdyuk, Emma Shroyer. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import Parser.Basic

namespace Parser

/-- Type of regular expressions -/
inductive RegEx (α : Type _)
| /-- Character set -/
set : (α → Bool) → RegEx α
| /-- Alternation -/
alt : RegEx α → RegEx α → RegEx α
| /-- Concatenation -/
cat : RegEx α → RegEx α → RegEx α
| /-- Unbounded repetition -/
repMany : RegEx α → RegEx α
| /-- Bounded repetition -/
repUpTo : Nat → RegEx α → RegEx α
| /-- Grouping (unused) -/
group : RegEx α → RegEx α

namespace RegEx

/-- Empty character set -/
def fail : RegEx α := set fun _ => false

instance (α) : Inhabited (RegEx α) := ⟨fail⟩

/-- Universal character set -/
def any : RegEx α := set fun _ => true

/-- Nil expression -/
def nil : RegEx α := repUpTo 0 default

/-- Optional -/
def opt (e : RegEx α) := repUpTo 1 e

/-- Repetition -/
def rep (n : Nat) (e : RegEx α) :=
match n with
| 0 => repUpTo 0 e
| 1 => e
| n+1 => cat e (rep n e)

/-- Unbounded repetition, at least once -/
def repMany1 (e : RegEx α) := cat e (repMany e)

/-- Unbounded repetition, at least `n` times -/
def repManyN (n : Nat) (e : RegEx α) :=
match n with
| 0 => repMany e
| n+1 => cat e (repManyN n e)

section
variable {ε σ α β} [Parser.Stream σ α] [Parser.Error ε σ α] {m} [Monad m] [MonadExceptOf ε m]

/-- Fold over a regex match from the right -/
protected partial def foldr (f : α → β → β) : RegEx α → ParserT ε σ α m β → ParserT ε σ α m β
| .set s, k => tokenFilter s >>= fun x => f x <$> k
| .alt e₁ e₂, k => RegEx.foldr f e₁ k <|> RegEx.foldr f e₂ k
| .cat e₁ e₂, k => RegEx.foldr f e₁ (RegEx.foldr f e₂ k)
| .repUpTo 0 _, k => k
| .repUpTo (n+1) e, k => RegEx.foldr f e (RegEx.foldr f (.repUpTo n e) k) <|> k
| .repMany e, k => RegEx.foldr f e (RegEx.foldr f (.repMany e) k) <|> k
| .group e, k => RegEx.foldr f e k

/-- Parses tokens matching regex `re` returning the list of tokens, otherwise fails -/
protected def «match» (re : RegEx α) : ParserT ε σ α m (List α) :=
re.foldr (.::.) (pure [])

end
153 changes: 153 additions & 0 deletions Parser/RegEx/Compile.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/-
Copyright © 2022-2023 François G. Dorais, Kyrill Serdyuk, Emma Shroyer. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import Parser.Char
import Parser.Char.Numeric
import Parser.RegEx.Basic

/-! ## RegEx Syntax
We currently use a very simplified form for regular expression syntax.
Operators:
* A match of `α|β` consists of either a match of `α` or a match of `β`
* A match of `αβ` consists of a match of `α` followed by a match of `β`
* A match of `α?` consists of at most one match of `α`
* A match of `α*` consists of zero or more back-to-back matches of `α`
* A match of `α+` consists of one or more back-to-back matches of `α`
* A match of `α{m}` consists of exactly `m` back-to-back matches of `α`
* A match of `α{m,n}` consists of at least `m` but at most `n` back-to-back matches of `α`
* A match of `α{m,}` consists of `m` or more back-to-back matches of `α`
* A match of `α{,n}` consists of at most `n` back-to-back matches of `α`
* A match of `(α)` consists of a match of `α`
These are listed from lowest to highest precedence.
Character matching:
* `.` matches any character.
* A single character matches itself with the exception of the special characters: `.`, `?`, `*`,
`+`, `|`, `\`, `(`, `)`, `{`, `}`, `[`, `]`. These special characters can be matched by
preceding them with an escape character `\`.
* `[s]` matches one character from the string `s`, the characters `\`, `[`, `]` must be preceded
by an escape character `\` within `s`.
* `[^s]` matches one character not in the string `s`, the characters `\`, `[`, `]` must be
preceded by an escape character `\` within `s`.
-/


namespace Parser.RegEx
open Char

private abbrev REParser := SimpleParser Substring Char

mutual

private partial def re0 : REParser (RegEx Char) :=
re1 >>= loop
where

loop (e : RegEx Char) := do
if ← test (char '|') then
loop (.alt e (← re1))
else
return e

private partial def re1 : REParser (RegEx Char) := do
re2 >>= loop <|> return .nil
where

loop (e : RegEx Char) := do
match ← option? re2 with
| some a => loop (.cat e a)
| none => return e

private partial def re2 : REParser (RegEx Char) :=
re3 >>= loop
where

loop (e : RegEx Char) := do
match ← option? <| first [star e, plus e, opt e, reps e] with
| some e => loop e
| none => return e

opt (e : RegEx Char) := do
char '?' *> return .opt e

star (e : RegEx Char) := do
char '*' *> return .repMany e

plus (e : RegEx Char) := do
char '+' *> return .repMany1 e

reps (e : RegEx Char) : REParser (RegEx Char) :=
withBacktracking do
let _ ← char '{'
let e ←
match ← option? ASCII.parseNat with
| some min =>
let emin : RegEx Char := RegEx.rep min e
match ← option? (char ',' *> option? ASCII.parseNat) with
| some (some max) => pure <| RegEx.cat emin (.repUpTo (max - min) e)
| some none => pure <| RegEx.cat emin (.repMany e)
| none => pure <| emin
| none =>
let max ← char ',' *> ASCII.parseNat
pure <| .repUpTo max e
let _ ← char '}'
return e

private partial def re3 : REParser (RegEx Char) := do
first [tok, any, set, grp]
where

any : REParser (RegEx Char) :=
char '.' *> return .any

grp : REParser (RegEx Char) :=
withBacktracking do
let _ ← char '('
let e ← re0
let _ ← char ')'
return e

setLoop (cs : List Char) := do
match ← option? <| tokenFilter (!['[', ']'].elem .) with
| some c =>
let c ← if c == '\\' then tokenFilter (['\\', '[', ']'].elem .) else pure c
setLoop (c :: cs)
| none => return cs

set : REParser (RegEx Char) :=
withBacktracking do
let _ ← char '['
let n ← test (char '^')
let s ← setLoop []
let _ ← char ']'
if n then
return .set (!s.elem .)
else
return .set (s.elem .)

tok : REParser (RegEx Char) := do
let special := ['.', '?', '*', '+', '|', '(', ')', '{', '}', '[', ']']
let c ← tokenFilter (!special.elem .)
let c ← if c == '\\' then tokenFilter (('\\' :: special).elem .) else pure c
return .set (. == c)

end

/-- Compiles a regex from a string, returns `none` on faiure -/
protected def compile? (s : String) : Option (RegEx Char) :=
match Parser.run (re0 <* endOfInput) s with
| .ok _ r => some r
| .error _ => none

/-- Compiles a regex from a string, panics on faiure -/
protected def compile! (s : String) : RegEx Char :=
match RegEx.compile? s with
| some r => r
| none => panic! "invalid regular expression"

0 comments on commit 4e9af27

Please sign in to comment.