Class: RE2::Scanner

Inherits:
Object show all
Includes:
Enumerable
Defined in:
ext/re2/re2.cc,
lib/re2/scanner.rb

Instance Method Summary collapse

Instance Method Details

#each ⇒ Object



16
17
18
19
20
21
22
23
24
# File 'lib/re2/scanner.rb', line 16

# Iterates over the remaining matches, yielding whatever #scan returns for
# each one until #scan returns a falsy value.
#
# @yieldparam submatches the value returned by one call to #scan
# @return [Enumerator] an enumerator over #each when no block is given
# @return [nil] after the last match has been yielded when a block is given
def each
  return to_enum(:each) unless block_given?

  # The assignment is deliberate: keep pulling results until #scan runs dry.
  while (submatches = scan)
    yield submatches
  end
end

#eof? ⇒ Boolean

Returns whether the RE2::Scanner has consumed all input or not.

Examples:

c = RE2::Regexp.new('(\d+)').scan("foo")
c.eof? #=> true

Returns:

  • (Boolean)

    whether the RE2::Scanner has consumed all input or not



365
366
367
368
369
# File 'ext/re2/re2.cc', line 365

/*
 * Report whether the scanner has consumed all of its input.
 *
 * Reads the `eof` flag stored on the wrapped scanner struct and returns it
 * converted through BOOL2RUBY.
 */
static VALUE re2_scanner_eof(const VALUE self) {
  return BOOL2RUBY(unwrap_re2_scanner(self)->eof);
}

#initialize_copy(other) ⇒ Object



398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
# File 'ext/re2/re2.cc', line 398

/*
 * Copy the scanning state of `other` into `self` (backs #initialize_copy).
 *
 * The regexp, text, capturing-group count and eof flag are copied across, and
 * `self` receives its own StringPiece duplicating `other`'s current input
 * position (or no input at all if `other` has none).
 *
 * The replacement StringPiece is allocated *before* anything on `self` is
 * touched. This keeps `self` unchanged if the allocation fails and
 * rb_raise unwinds, and it makes copying an object onto itself safe: the
 * source input is duplicated before the old one is released.
 *
 * Raises NoMemoryError if the StringPiece cannot be allocated.
 */
static VALUE re2_scanner_initialize_copy(VALUE self, VALUE other) {
  re2_scanner *self_c;
  re2_scanner *other_c = unwrap_re2_scanner(other);

  TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, self_c);

  /* Build the new input first so a failure cannot leave self half-copied. */
  re2::StringPiece *input_copy = nullptr;
  if (other_c->input) {
    input_copy = new(std::nothrow) re2::StringPiece(*other_c->input);
    if (input_copy == nullptr) {
      rb_raise(rb_eNoMemError,
               "not enough memory to allocate StringPiece for input");
    }
  }

  /* Nothing below can fail, so it is now safe to release the old input. */
  delete self_c->input;
  self_c->input = input_copy;

  RB_OBJ_WRITE(self, &self_c->regexp, other_c->regexp);
  RB_OBJ_WRITE(self, &self_c->text, other_c->text);
  self_c->number_of_capturing_groups = other_c->number_of_capturing_groups;
  self_c->eof = other_c->eof;

  return self;
}

#regexp ⇒ RE2::Regexp

Returns the Regexp used in the RE2::Scanner.

Examples:

c = RE2::Regexp.new('(\d+)').scan("bob 123")
c.regexp #=> #<RE2::Regexp /(\d+)/>

Returns:



755
756
757
758
759
# File 'ext/re2/re2.cc', line 755

/*
 * Return the regexp VALUE stored on the wrapped scanner struct.
 */
static VALUE re2_scanner_regexp(const VALUE self) {
  return unwrap_re2_scanner(self)->regexp;
}

#rewind ⇒ Object

Rewind the RE2::Scanner to the start of the string.

Examples:

s = RE2::Regexp.new('(\d+)').scan("1 2 3")
e = s.to_enum
e.next #=> ["1"]
e.next #=> ["2"]
s.rewind
e.next #=> ["1"]


382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# File 'ext/re2/re2.cc', line 382

/*
 * Reset the scanner so the next scan starts from the beginning of the text.
 *
 * A fresh StringPiece covering the whole of the stored text replaces the
 * current input and the eof flag is cleared. Returns self.
 *
 * The new StringPiece is allocated before the old one is released, so if the
 * allocation fails and rb_raise unwinds, the scanner still holds its previous
 * (valid) input rather than a null pointer.
 *
 * Raises NoMemoryError if the StringPiece cannot be allocated.
 */
static VALUE re2_scanner_rewind(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);

  re2::StringPiece *fresh_input = new(std::nothrow) re2::StringPiece(
      RSTRING_PTR(c->text), RSTRING_LEN(c->text));
  if (fresh_input == nullptr) {
    rb_raise(rb_eNoMemError,
             "not enough memory to allocate StringPiece for input");
  }

  /* Allocation succeeded: swap the inputs and clear the exhausted flag. */
  delete c->input;
  c->input = fresh_input;
  c->eof = false;

  return self;
}

#scan ⇒ Array<String>, ...

Scan the given text incrementally for matches using FindAndConsume, returning an array of submatches on each subsequent call. Returns nil if no matches are found or an empty array for every match if the pattern has no capturing groups.

Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the Regexp is set to false (any other encoding's behaviour is undefined).

Examples:

s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
s.scan #=> ["Foo"]
s.scan #=> ["bar"]

Returns:

  • (Array<String>)

    if the pattern has capturing groups

  • ([])

    if the pattern does not have capturing groups

  • (nil)

    if no matches are found



446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
# File 'ext/re2/re2.cc', line 446

/*
 * Find the next match in the scanner's remaining input and return its
 * submatches as a Ruby array (one entry per capturing group, nil for a group
 * whose StringPiece has no data). Returns nil when the scanner is already at
 * eof or when FindAndConsumeN finds no match.
 *
 * On a successful match the eof flag is updated from the amount of input left,
 * and if the match consumed nothing the input is advanced by one character so
 * that repeated calls make progress.
 */
static VALUE re2_scanner_scan(VALUE self) {
  re2_scanner *c = unwrap_re2_scanner(self);
  re2_pattern *p = unwrap_re2_regexp(c->regexp);

  /* Nothing left to scan: bail out before doing any work. */
  if (c->eof) {
    return Qnil;
  }

  /* FindAndConsumeN takes an array of RE2::Arg pointers, so each group needs
   * a StringPiece to receive the capture, an Arg wrapping it, and a pointer
   * to that Arg. */
  std::vector<re2::StringPiece> captures(c->number_of_capturing_groups);
  std::vector<RE2::Arg> arg_storage(c->number_of_capturing_groups);
  std::vector<RE2::Arg*> arg_ptrs(c->number_of_capturing_groups);

  for (int i = 0; i < c->number_of_capturing_groups; ++i) {
    arg_storage[i] = &captures[i];
    arg_ptrs[i] = &arg_storage[i];
  }

  /* Remember how much input there was so we can tell if the match moved. */
  const re2::StringPiece::size_type size_before = c->input->size();

  if (!RE2::FindAndConsumeN(c->input, *p->pattern, arg_ptrs.data(),
        c->number_of_capturing_groups)) {
    return Qnil;
  }

  const re2::StringPiece::size_type size_after = c->input->size();
  const bool consumed_input = size_after < size_before;

  VALUE result = rb_ary_new2(c->number_of_capturing_groups);

  for (int i = 0; i < c->number_of_capturing_groups; ++i) {
    const re2::StringPiece &capture = captures[i];

    /* A capture with no data pointer is reported as nil. */
    VALUE entry = capture.data() == nullptr
        ? Qnil
        : encoded_str_new(capture.data(), capture.size(),
              p->pattern->options().encoding());

    rb_ary_push(result, entry);
  }

  /* The scanner is exhausted once the match has eaten all remaining input. */
  c->eof = size_after == 0;

  /* A match that consumed nothing (with input still remaining) would be found
   * again on the next call, so step past one character by hand. In UTF-8 mode
   * that step is a whole character, not a single byte, so multi-byte
   * characters are never split.
   *
   * The table maps the high 4 bits of the lead byte to the character's byte
   * length; the approach comes from RE2's own Python extension.
   *
   * See https://github.com/google/re2/blob/972a15cedd008d846f1a39b2e88ce48d7f166cbd/python/_re2.cc#L46-L48
   */
  if (size_after > 0 && !consumed_input) {
    size_t step = 1;

    if (p->pattern->options().encoding() == RE2::Options::EncodingUTF8) {
      step = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"
          [((*c->input)[0] & 0xFF) >> 4];

      /* Never step past the end of a truncated trailing character. */
      if (step > size_after) {
        step = size_after;
      }
    }

    c->input->remove_prefix(step);
  }

  return result;
}

#string ⇒ String

Returns a frozen copy of the text supplied when incrementally matching with Regexp#scan.

If the text was already a frozen string, returns the original.

Examples:

c = RE2::Regexp.new('(\d+)').scan("foo")
c.string #=> "foo"

Returns:



351
352
353
354
355
# File 'ext/re2/re2.cc', line 351

/*
 * Return the text VALUE stored on the wrapped scanner struct.
 */
static VALUE re2_scanner_string(const VALUE self) {
  const re2_scanner *scanner = unwrap_re2_scanner(self);

  return scanner->text;
}