! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: peg.ebnf kernel math.parser sequences assocs arrays fry math combinators regexp.classes strings splitting peg locals accessors regexp.ast ; IN: regexp.parser : allowed-char? ( ch -- ? ) ".()|[*+?" member? not ; ERROR: bad-number ; : ensure-number ( n -- n ) [ bad-number ] unless* ; :: at-error ( key assoc quot: ( key -- replacement ) -- value ) key assoc at* [ drop key quot call ] unless ; inline ERROR: bad-class name ; : name>class ( name -- class ) { { "Lower" letter-class } { "Upper" LETTER-class } { "Alpha" Letter-class } { "ASCII" ascii-class } { "Digit" digit-class } { "Alnum" alpha-class } { "Punct" punctuation-class } { "Graph" java-printable-class } { "Print" java-printable-class } { "Blank" non-newline-blank-class } { "Cntrl" control-character-class } { "XDigit" hex-digit-class } { "Space" java-blank-class } ! TODO: unicode-character-class } [ bad-class ] at-error ; : lookup-escape ( char -- ast ) { { CHAR: t [ CHAR: \t ] } { CHAR: n [ CHAR: \n ] } { CHAR: r [ CHAR: \r ] } { CHAR: f [ HEX: c ] } { CHAR: a [ HEX: 7 ] } { CHAR: e [ HEX: 1b ] } { CHAR: \\ [ CHAR: \\ ] } { CHAR: w [ c-identifier-class ] } { CHAR: W [ c-identifier-class ] } { CHAR: s [ java-blank-class ] } { CHAR: S [ java-blank-class ] } { CHAR: d [ digit-class ] } { CHAR: D [ digit-class ] } [ ] } case ; : options-assoc ( -- assoc ) H{ { CHAR: i case-insensitive } { CHAR: d unix-lines } { CHAR: m multiline } { CHAR: n multiline } { CHAR: r reversed-regexp } { CHAR: s dotall } { CHAR: u unicode-case } { CHAR: x comments } } ; : ch>option ( ch -- singleton ) options-assoc at ; : option>ch ( option -- string ) options-assoc value-at ; : parse-options ( on off -- options ) [ [ ch>option ] { } map-as ] bi@ ; : string>options ( string -- options ) "-" split1 parse-options ; : options>string ( options -- string ) [ on>> ] [ off>> ] bi [ [ option>ch ] map ] bi@ [ "-" glue ] unless-empty "" like ; ! TODO: add syntax for various parenthized things, ! add greedy and nongreedy forms of matching ! (once it's all implemented) EBNF: parse-regexp CharacterInBracket = !("}") Character QuotedCharacter = !("\\E") . Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class ]] | "P{" CharacterInBracket*:s "}" => [[ s >string name>class ]] | "Q" QuotedCharacter*:s "\\E" => [[ s ]] | "u" Character:a Character:b Character:c Character:d => [[ { a b c d } hex> ensure-number ]] | "x" Character:a Character:b => [[ { a b } hex> ensure-number ]] | "0" Character:a Character:b Character:c => [[ { a b c } oct> ensure-number ]] | . => [[ lookup-escape ]] EscapeSequence = "\\" Escape:e => [[ e ]] Character = EscapeSequence | . ?[ allowed-char? ]? AnyRangeCharacter = EscapeSequence | . RangeCharacter = !("]") AnyRangeCharacter Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | RangeCharacter StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | AnyRangeCharacter Ranges = StartRange:s Range*:r => [[ r s prefix ]] CharClass = "^"?:n Ranges:e => [[ e n char-class ]] Options = [idmsux]* Parenthized = "?:" Alternation:a => [[ a ]] | "?" Options:on "-"? Options:off ":" Alternation:a => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] | "?=" Alternation:a => [[ a ]] | "?!" Alternation:a => [[ a ]] | "?<=" Alternation:a => [[ a ]] | "? [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] | "[" CharClass:r "]" => [[ r ]] | ".":d => [[ any-char ]] | Character Number = (!(","|"}").)* => [[ string>number ensure-number ]] Times = "," Number:n "}" => [[ 0 n ]] | Number:n ",}" => [[ n ]] | Number:n "}" => [[ n n ]] | "}" => [[ bad-number ]] | Number:n "," Number:m "}" => [[ n m ]] Repeated = Element:e "{" Times:t => [[ e t ]] | Element:e "?" => [[ e ]] | Element:e "*" => [[ e ]] | Element:e "+" => [[ e ]] | Element Concatenation = Repeated*:r => [[ r sift ]] Alternation = Concatenation:c ("|" Concatenation)*:a => [[ a empty? [ c ] [ a values c prefix ] if ]] End = !(.) Main = Alternation End ;EBNF