class Regexp::Lexer
A very thin wrapper around the scanner that breaks quantified literal runs, collects emitted tokens into an array, calculates their nesting depth, normalizes tokens for the parser, and checks that they are implemented by the given syntax flavor.
Constants
- CLOSING_TOKENS
- CONDITION_TOKENS
- OPENING_TOKENS
Attributes
block[RW]
collect_tokens[RW]
conditional_nesting[RW]
nesting[RW]
preprev_token[RW]
prev_token[RW]
set_nesting[RW]
shift[RW]
tokens[RW]
Public Class Methods
lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 16
# Convenience class-level entry point: lexes +input+ on a fresh Lexer
# instance. All arguments are forwarded verbatim to #lex, which defines
# their semantics and the return value.
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  lexer = new
  lexer.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
end
Also aliased as: scan
Public Instance Methods
emit(token)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 71
# Hands a finished token to the caller-supplied block (if one was given to
# #lex) and/or appends it to the +tokens+ array. With a block, the block's
# return value is what gets collected.
def emit(token)
  unless block
    tokens << token
    return
  end
  # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
  result = block.call(token)
  tokens << result if collect_tokens
end
lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 20 def lex(input, syntax = nil, options: nil, collect_tokens: true, &block) syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT self.block = block self.collect_tokens = collect_tokens self.tokens = [] self.prev_token = nil self.preprev_token = nil self.nesting = 0 self.set_nesting = 0 self.conditional_nesting = 0 self.shift = 0 Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te| type, token = *syntax.normalize(type, token) syntax.check! type, token ascend(type, token) if (last = prev_token) && type == :quantifier && ( (last.type == :literal && (parts = break_literal(last))) || (last.token == :codepoint_list && (parts = break_codepoint_list(last))) ) emit(parts[0]) last = parts[1] end current = Regexp::Token.new(type, token, text, ts + shift, te + shift, nesting, set_nesting, conditional_nesting) if type == :conditional && CONDITION_TOKENS.include?(token) current = merge_condition(current, last) elsif last last.next = current current.previous = last emit(last) end self.preprev_token = last self.prev_token = current descend(type, token) end emit(prev_token) if prev_token collect_tokens ? tokens : nil end
Private Instance Methods
ascend(type, token)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 91
# Decrements the matching nesting counter when +token+ closes a group/
# assertion, a character set, or a conditional. Called before the current
# token is built, so closers carry the outer depth.
def ascend(type, token)
  case type
  when :group, :assertion
    self.nesting -= 1 if CLOSING_TOKENS.include?(token)
  when :set
    self.set_nesting -= 1 if token == :close
  when :conditional
    self.conditional_nesting -= 1 if token == :close
  end
end
break_codepoint_list(token)
click to toggle source
if a codepoint list is followed by a quantifier, that quantifier applies to the last codepoint only, e.g. /\u{61 62 63}{3}/ =~ 'abccc'. cf. break_literal.
# File lib/regexp_parser/lexer.rb, line 135 def break_codepoint_list(token) lead, _, tail = token.text.rpartition(' ') return if lead.empty? token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}', token.ts, (token.te - tail.length), nesting, set_nesting, conditional_nesting) token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail, (token.ts + lead.length + 1), (token.te + 3), nesting, set_nesting, conditional_nesting) self.shift = shift + 3 # one space less, but extra \, u, {, and } token_1.previous = preprev_token token_1.next = token_2 token_2.previous = token_1 # .next will be set by #lex [token_1, token_2] end
break_literal(token)
click to toggle source
called by scan to break a literal run that is longer than one character into two separate tokens when it is followed by a quantifier
# File lib/regexp_parser/lexer.rb, line 115
# Splits a literal run that is longer than one character into two literal
# tokens when it is followed by a quantifier, so the quantifier applies
# only to the final character. Returns nil for single-character runs,
# otherwise [head_token, tail_token]; cf. #break_codepoint_list.
def break_literal(token)
  prefix, final_char, _ = token.text.partition(/.\z/mu)
  return if prefix.empty?

  head = Regexp::Token.new(:literal, :literal, prefix,
                           token.ts, (token.te - final_char.length),
                           nesting, set_nesting, conditional_nesting)
  tail = Regexp::Token.new(:literal, :literal, final_char,
                           (token.ts + prefix.length), token.te,
                           nesting, set_nesting, conditional_nesting)

  head.previous = preprev_token
  head.next = tail
  tail.previous = head # .next will be set by #lex
  [head, tail]
end
descend(type, token)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 102
# Increments the matching nesting counter when +token+ opens a group/
# assertion, a character set, or a conditional. Called after the current
# token is built, so openers carry the outer depth.
def descend(type, token)
  case type
  when :group, :assertion
    self.nesting += 1 if OPENING_TOKENS.include?(token)
  when :set
    self.set_nesting += 1 if token == :open
  when :conditional
    self.conditional_nesting += 1 if token == :open
  end
end
merge_condition(current, last)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 154
# Fuses a condition token (+current+) into the conditional token that
# precedes it (+last+), returning one :conditional/:condition token whose
# text and span cover both.
def merge_condition(current, last)
  Regexp::Token.new(
    :conditional, :condition,
    last.text + current.text,
    last.ts, current.te,
    nesting, set_nesting, conditional_nesting
  ).tap do |merged|
    merged.previous = preprev_token # .next will be set by #lex
  end
end