Class: RE2::Set

Inherits:
Object show all
Defined in:
ext/re2/re2.cc

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ RE2::Set #initialize(anchor) ⇒ RE2::Set #initialize(anchor, options) ⇒ RE2::Set

Returns a new RE2::Set object, a collection of patterns that can be searched for simultaneously.

Overloads:

  • #initialize ⇒ RE2::Set

    Returns a new RE2::Set object for unanchored patterns with the default options.

    Raises:

    • (NoMemoryError)

      if memory could not be allocated for the compiled pattern

  • #initialize(anchor) ⇒ RE2::Set

    Returns a new RE2::Set object for the specified anchor with the default options.

    Parameters:

    • anchor (Symbol)

      one of :unanchored, :anchor_start, :anchor_both

    Raises:

    • (ArgumentError)

      if anchor is not :unanchored, :anchor_start or :anchor_both

    • (NoMemoryError)

      if memory could not be allocated for the compiled pattern

  • #initialize(anchor, options) ⇒ RE2::Set

    Returns a new RE2::Set object with the specified options.

    Parameters:

    • anchor (Symbol)

      one of :unanchored, :anchor_start, :anchor_both

    • options (Hash)

      the options with which to compile the pattern

    Options Hash (options):

    • :utf8 (Boolean) — default: true

      text and pattern are UTF-8; otherwise Latin-1

    • :posix_syntax (Boolean) — default: false

      restrict regexps to POSIX egrep syntax

    • :longest_match (Boolean) — default: false

      search for longest match, not first match

    • :log_errors (Boolean) — default: true

      log syntax and execution errors to ERROR

    • :max_mem (Integer)

      approx. max memory footprint of RE2

    • :literal (Boolean) — default: false

      interpret string as literal, not regexp

    • :never_nl (Boolean) — default: false

      never match \n, even if it is in regexp

    • :case_sensitive (Boolean) — default: true

      match is case-sensitive (regexp can override with (?i) unless in posix_syntax mode)

    • :perl_classes (Boolean) — default: false

      allow Perl's \d \s \w \D \S \W when in posix_syntax mode

    • :word_boundary (Boolean) — default: false

      allow \b \B (word boundary and not) when in posix_syntax mode

    • :one_line (Boolean) — default: false

      ^ and $ only match beginning and end of text when in posix_syntax mode

    Raises:

    • (ArgumentError)

      if anchor is not one of the accepted choices

    • (NoMemoryError)

      if memory could not be allocated for the compiled pattern



1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
# File 'ext/re2/re2.cc', line 1872

/*
 * Implements RE2::Set#initialize: builds the underlying RE2::Set from an
 * optional anchor symbol and an optional options hash.
 *
 * When no anchor is given the set is unanchored; when no options are given
 * a default-constructed RE2::Options is used.
 *
 * Raises ArgumentError if the anchor is not one of :unanchored,
 * :anchor_start or :anchor_both, and NoMemoryError if the RE2::Set object
 * could not be allocated. Returns self.
 */
static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
  VALUE anchor_arg, options_arg;
  rb_scan_args(argc, argv, "02", &anchor_arg, &options_arg);

  re2_set *wrapper;
  TypedData_Get_Struct(self, re2_set, &re2_set_data_type, wrapper);

  /* Stay unanchored unless the caller names a different mode. */
  RE2::Anchor anchor_mode = RE2::UNANCHORED;

  if (!NIL_P(anchor_arg)) {
    Check_Type(anchor_arg, T_SYMBOL);
    const ID requested = SYM2ID(anchor_arg);

    if (requested == id_anchor_start) {
      anchor_mode = RE2::ANCHOR_START;
    } else if (requested == id_anchor_both) {
      anchor_mode = RE2::ANCHOR_BOTH;
    } else if (requested != id_unanchored) {
      /* :unanchored is already the default, so only unknown symbols fail. */
      rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
    }
  }

  RE2::Options set_options;
  if (RTEST(options_arg)) {
    parse_re2_options(&set_options, options_arg);
  }

  /* Allocate with nothrow so a failed allocation yields a null pointer that
   * is reported below as a Ruby NoMemoryError.
   */
  wrapper->set = new(std::nothrow) RE2::Set(set_options, anchor_mode);
  if (!wrapper->set) {
    rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
  }

  return self;
}

Class Method Details

.match_raises_errors? ⇒ Boolean

Returns whether the underlying RE2 version outputs error information from RE2::Set::Match. If it does not, #match will raise an error whenever its :exception option is true (the default), so :exception must be set to false.

Returns:

  • (Boolean)

    whether the underlying RE2 outputs error information from RE2::Set matches



1973
1974
1975
1976
1977
1978
1979
# File 'ext/re2/re2.cc', line 1973

/*
 * Implements RE2::Set.match_raises_errors?: reports whether this build was
 * compiled with HAVE_ERROR_INFO_ARGUMENT defined, i.e. whether
 * RE2::Set::Match can be asked for error information.
 */
static VALUE re2_set_match_raises_errors_p(VALUE) {
#ifdef HAVE_ERROR_INFO_ARGUMENT
  const bool has_error_info = true;
#else
  const bool has_error_info = false;
#endif

  return has_error_info ? Qtrue : Qfalse;
}

Instance Method Details

#add(pattern) ⇒ Integer

Adds a pattern to the set. Returns the index that will identify the pattern in the output of #match. Cannot be called after #compile has been called.

Examples:

set = RE2::Set.new
set.add("abc") #=> 0
set.add("def") #=> 1

Parameters:

  • pattern (String)

    the regex pattern

Returns:

  • (Integer)

    the index of the pattern in the set

Raises:

  • (ArgumentError)

    if called after compile or the pattern is rejected



1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
# File 'ext/re2/re2.cc', line 1922

/*
 * Implements RE2::Set#add: adds the given pattern to the set and returns
 * the index that identifies it in the output of #match.
 *
 * Raises ArgumentError, including the message reported by RE2 (truncated to
 * fit a fixed-size buffer), when RE2::Set::Add returns a negative index.
 */
static VALUE re2_set_add(VALUE self, VALUE pattern) {
  StringValue(pattern);

  re2_set *wrapper;
  TypedData_Get_Struct(self, re2_set, &re2_set_data_type, wrapper);

  char reason[100];
  int pattern_index;

  /* The std::string receiving RE2's error text lives only inside this inner
   * scope: its contents are copied into the fixed buffer above so that the
   * string has already been released by the time rb_raise may be called,
   * and its memory cannot leak.
   */
  {
    std::string add_error;
    const re2::StringPiece source(RSTRING_PTR(pattern), RSTRING_LEN(pattern));

    pattern_index = wrapper->set->Add(source, &add_error);
    strlcpy(reason, add_error.c_str(), sizeof(reason));
  }

  if (pattern_index < 0) {
    rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", reason);
  }

  return INT2FIX(pattern_index);
}

#compile ⇒ Boolean

Compiles a RE2::Set so it can be used to match against. Must be called after #add and before #match.

Examples:

set = RE2::Set.new
set.add("abc")
set.compile #=> true

Returns:

  • (Boolean)

    whether compilation was a success



1958
1959
1960
1961
1962
1963
# File 'ext/re2/re2.cc', line 1958

/*
 * Implements RE2::Set#compile: compiles the underlying RE2::Set and returns
 * the result of RE2::Set::Compile as a Ruby boolean.
 */
static VALUE re2_set_compile(VALUE self) {
  re2_set *wrapper;
  TypedData_Get_Struct(self, re2_set, &re2_set_data_type, wrapper);

  const bool compiled = wrapper->set->Compile();

  return BOOL2RUBY(compiled);
}

#match(str) ⇒ Array<Integer> #match(str, options) ⇒ Array<Integer>

Matches the given text against patterns in the set, returning an array of integer indices of the matching patterns if matched or an empty array if there are no matches.

Overloads:

  • #match(str) ⇒ Array<Integer>

    Returns an array of integer indices of patterns matching the given string (if any). Raises exceptions if there are any errors while matching.

    Examples:

    set = RE2::Set.new
    set.add("abc")
    set.add("def")
    set.compile
    set.match("abcdef") #=> [0, 1]

    Parameters:

    • str (String)

      the text to match against

    Returns:

    • (Array<Integer>)

      the indices of matching regexps

    Raises:

    • (MatchError)

      if an error occurs while matching

    • (UnsupportedError)

      if the underlying version of RE2 does not output error information

  • #match(str, options) ⇒ Array<Integer>

    Returns an array of integer indices of patterns matching the given string (if any). Raises exceptions if there are any errors while matching and the :exception option is set to true.

    Examples:

    set = RE2::Set.new
    set.add("abc")
    set.add("def")
    set.compile
    set.match("abcdef", exception: true) #=> [0, 1]

    Parameters:

    • str (String)

      the text to match against

    • options (Hash)

      the options with which to match

    Options Hash (options):

    • :exception (Boolean) — default: true

      whether to raise exceptions with RE2's error information (not supported on ABI version 0 of RE2)

    Returns:

    • (Array<Integer>)

      the indices of matching regexps

    Raises:

    • (MatchError)

      if an error occurs while matching

    • (UnsupportedError)

      if the underlying version of RE2 does not output error information

Returns:

  • (Array<Integer>)


2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
# File 'ext/re2/re2.cc', line 2021

/*
 * Implements RE2::Set#match: matches the given string against the patterns
 * in the set and returns an Array of the indices of the matching patterns
 * (empty when nothing matched).
 *
 * An optional options hash may carry :exception (default: true). When it is
 * true, matching errors are raised as the set's match error; if this build
 * lacks HAVE_ERROR_INFO_ARGUMENT the unsupported error is raised instead.
 * When it is false, no error information is requested and a failed match
 * simply yields an empty Array.
 */
static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
  VALUE str, options;
  rb_scan_args(argc, argv, "11", &str, &options);
  StringValue(str);

  re2_set *wrapper;
  TypedData_Get_Struct(self, re2_set, &re2_set_data_type, wrapper);

  /* Errors are raised unless :exception is explicitly given as false; a
   * missing or nil :exception keeps the default.
   */
  bool raise_on_error = true;
  if (RTEST(options)) {
    Check_Type(options, T_HASH);

    const VALUE requested = rb_hash_aref(options, ID2SYM(id_exception));
    if (!NIL_P(requested)) {
      raise_on_error = RTEST(requested);
    }
  }

  std::vector<int> hits;
  const re2::StringPiece text(RSTRING_PTR(str), RSTRING_LEN(str));

  /* Non-raising path: no error information is requested from RE2. */
  if (!raise_on_error) {
    const bool matched = wrapper->set->Match(text, &hits);
    VALUE indices = rb_ary_new2(hits.size());

    if (matched) {
      for (std::vector<int>::const_iterator it = hits.begin(); it != hits.end(); ++it) {
        rb_ary_push(indices, INT2FIX(*it));
      }
    }

    return indices;
  }

#ifdef HAVE_ERROR_INFO_ARGUMENT
  RE2::Set::ErrorInfo info;
  const bool matched = wrapper->set->Match(text, &hits, &info);
  VALUE indices = rb_ary_new2(hits.size());

  if (matched) {
    for (std::vector<int>::const_iterator it = hits.begin(); it != hits.end(); ++it) {
      rb_ary_push(indices, INT2FIX(*it));
    }

    return indices;
  }

  /* Match returned false: either nothing matched (kNoError) or RE2 reported
   * a failure, which is translated into a Ruby exception.
   */
  switch (info.kind) {
    case RE2::Set::kNoError:
      break;
    case RE2::Set::kNotCompiled:
      rb_raise(re2_eSetMatchError, "#match must not be called before #compile");
    case RE2::Set::kOutOfMemory:
      rb_raise(re2_eSetMatchError, "The DFA ran out of memory");
    case RE2::Set::kInconsistent:
      rb_raise(re2_eSetMatchError, "RE2::Prog internal error");
    default:  // Just in case a future version of libre2 adds new ErrorKinds
      rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", info.kind);
  }

  /* No error and no match: the freshly allocated, empty Array. */
  return indices;
#else
  rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
#endif
}