Class: RE2::Scanner
- Includes:
- Enumerable
- Defined in:
- ext/re2/re2.cc,
lib/re2/scanner.rb
Instance Method Summary collapse
- #each ⇒ Object
-
#eof? ⇒ Boolean
Returns whether the Scanner has consumed all input or not.
- #initialize_copy(other) ⇒ Object
- #regexp ⇒ RE2::Regexp
-
#rewind ⇒ Object
Rewind the Scanner to the start of the string.
-
#scan ⇒ Array<String>, ...
Scan the given text incrementally for matches using
FindAndConsume, returning an array of submatches on each subsequent call. -
#string ⇒ String
Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.
Instance Method Details
#each ⇒ Object
16 17 18 19 20 21 22 23 24 |
# File 'lib/re2/scanner.rb', line 16
# Yields the result of each successive call to #scan until #scan returns a
# falsy value. Without a block, returns an enumerator over #each instead.
def each
  return to_enum(:each) unless block_given?

  while (submatches = scan)
    yield submatches
  end
end
#eof? ⇒ Boolean
Returns whether the RE2::Scanner has consumed all input or not.
365 366 367 368 369 |
# File 'ext/re2/re2.cc', line 365
/*
 * Reports the scanner's stored `eof` flag, converted to a Ruby value via
 * BOOL2RUBY.
 */
static VALUE re2_scanner_eof(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return BOOL2RUBY(scanner->eof);
}
|
#initialize_copy(other) ⇒ Object
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 |
# File 'ext/re2/re2.cc', line 398
/*
 * Makes `self` a copy of the scanner `other`: same regexp, same text, same
 * capturing-group count, same eof flag, and an independent StringPiece that
 * duplicates other's current input position.
 *
 * Copying a scanner onto itself is a no-op. The duplicate StringPiece is
 * allocated before anything on `self` is modified, so if allocation fails
 * (NoMemoryError is raised) `self` is left exactly as it was.
 */
static VALUE re2_scanner_initialize_copy(VALUE self, VALUE other) {
  re2_scanner *self_c;
  re2_scanner *other_c = unwrap_re2_scanner(other);
  TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, self_c);

  /* Self-copy: releasing our input and then copying it from ourselves
   * would leave the scanner without any input, so do nothing instead. */
  if (self == other) {
    return self;
  }

  /* Build the replacement first; rb_raise does not return, so nothing on
   * self may have been torn down by the time it can fire. */
  re2::StringPiece *input_copy = nullptr;
  if (other_c->input) {
    input_copy = new(std::nothrow) re2::StringPiece(*other_c->input);
    if (input_copy == nullptr) {
      rb_raise(rb_eNoMemError,
               "not enough memory to allocate StringPiece for input");
    }
  }

  /* Deleting a null pointer is a no-op, so no guard is needed here. */
  delete self_c->input;
  self_c->input = input_copy;

  RB_OBJ_WRITE(self, &self_c->regexp, other_c->regexp);
  RB_OBJ_WRITE(self, &self_c->text, other_c->text);
  self_c->number_of_capturing_groups = other_c->number_of_capturing_groups;
  self_c->eof = other_c->eof;

  return self;
}
|
#regexp ⇒ RE2::Regexp
Returns the Regexp used in the RE2::Scanner.
755 756 757 758 759 |
# File 'ext/re2/re2.cc', line 755
/*
 * Returns the regexp VALUE stored on the scanner (the RE2::Regexp it was
 * created with, per the method's documentation).
 */
static VALUE re2_scanner_regexp(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return scanner->regexp;
}
|
#rewind ⇒ Object
Rewind the RE2::Scanner to the start of the string.
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 |
# File 'ext/re2/re2.cc', line 382
/*
 * Rewinds the scanner to the start of its text: replaces the input
 * StringPiece with a fresh one spanning the whole of `text` and clears the
 * eof flag. Returns self.
 *
 * The replacement is allocated before the old input is released, so if the
 * allocation fails (NoMemoryError is raised) the scanner keeps its previous
 * input and position instead of being left without any input.
 */
static VALUE re2_scanner_rewind(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);

  re2::StringPiece *rewound = new(std::nothrow) re2::StringPiece(
      RSTRING_PTR(c->text), RSTRING_LEN(c->text));
  if (rewound == nullptr) {
    /* rb_raise does not return; c->input is still intact at this point. */
    rb_raise(rb_eNoMemError,
             "not enough memory to allocate StringPiece for input");
  }

  delete c->input;
  c->input = rewound;
  c->eof = false;

  return self;
}
|
#scan ⇒ Array<String>, ...
Scan the given text incrementally for matches using
FindAndConsume, returning an array of submatches on each subsequent
call. Returns nil if no matches are found or an empty array for every
match if the pattern has no capturing groups.
Note that RE2 only supports the UTF-8 and ISO-8859-1 encodings, so strings
will be returned in UTF-8 by default, or in ISO-8859-1 if the :utf8 option for
the Regexp is set to false (the behaviour of any other encoding is undefined).
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 |
# File 'ext/re2/re2.cc', line 446
/*
 * Performs one incremental match against the scanner's remaining input using
 * RE2::FindAndConsumeN.
 *
 * Returns nil if the scanner is already at eof or the pattern does not match.
 * Otherwise returns an array with one entry per capturing group: nil for a
 * group whose StringPiece has no data, or an encoded string for the captured
 * text. The array is empty when the pattern has no capturing groups.
 */
static VALUE re2_scanner_scan(VALUE self) {
  re2_scanner *scanner = unwrap_re2_scanner(self);
  re2_pattern *compiled = unwrap_re2_regexp(scanner->regexp);

  if (scanner->eof) {
    return Qnil;
  }

  const auto group_count = scanner->number_of_capturing_groups;

  /* FindAndConsumeN wants an array of pointers to RE2::Arg, each of which
   * wraps the StringPiece that receives the corresponding capture. */
  std::vector<re2::StringPiece> captures(group_count);
  std::vector<RE2::Arg> arg_values(group_count);
  std::vector<RE2::Arg*> arg_pointers(group_count);
  for (int i = 0; i < group_count; ++i) {
    arg_values[i] = &captures[i];
    arg_pointers[i] = &arg_values[i];
  }

  const re2::StringPiece::size_type size_before = scanner->input->size();
  if (!RE2::FindAndConsumeN(scanner->input, *compiled->pattern,
                            arg_pointers.data(), group_count)) {
    return Qnil;
  }
  const re2::StringPiece::size_type size_after = scanner->input->size();

  const auto encoding = compiled->pattern->options().encoding();

  VALUE submatches = rb_ary_new2(group_count);
  for (int i = 0; i < group_count; ++i) {
    const re2::StringPiece &capture = captures[i];
    VALUE entry = Qnil;
    if (capture.data() != nullptr) {
      entry = encoded_str_new(capture.data(), capture.size(), encoding);
    }
    rb_ary_push(submatches, entry);
  }

  /* The eof flag reflects the input as the match left it, i.e. before any
   * manual advance below. */
  scanner->eof = (size_after == 0);

  /* A match that consumed nothing would be found again forever, so step the
   * input forward ourselves by one whole character.
   *
   * For UTF-8 the character's byte length is looked up from the high 4 bits
   * of its lead byte, the approach used by RE2's own Python extension:
   * https://github.com/google/re2/blob/972a15cedd008d846f1a39b2e88ce48d7f166cbd/python/_re2.cc#L46-L48
   */
  const bool consumed_nothing = !(size_after < size_before);
  if (consumed_nothing && size_after > 0) {
    size_t step = 1;
    if (encoding == RE2::Options::EncodingUTF8) {
      static const char utf8_length_by_high_nibble[] =
          "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4";
      const int lead_byte = (*scanner->input)[0] & 0xFF;
      step = utf8_length_by_high_nibble[lead_byte >> 4];
      /* Never step past the end of a truncated multi-byte sequence. */
      if (step > size_after) {
        step = size_after;
      }
    }
    scanner->input->remove_prefix(step);
  }

  return submatches;
}
|
#string ⇒ String
Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.
If the text was already a frozen string, returns the original.
351 352 353 354 355 |
# File 'ext/re2/re2.cc', line 351
/*
 * Returns the text VALUE stored on the scanner (documented as a frozen copy
 * of the string being scanned, or the original if it was already frozen).
 */
static VALUE re2_scanner_string(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return scanner->text;
}
|