StrSearcher: Additional comments and small code moves
Break out a separate static method to create the "byteset".
This commit is contained in:
@@ -641,6 +641,8 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
|
|||||||
}
|
}
|
||||||
StrSearcherImpl::TwoWay(ref mut searcher) => {
|
StrSearcherImpl::TwoWay(ref mut searcher) => {
|
||||||
let is_long = searcher.memory == usize::MAX;
|
let is_long = searcher.memory == usize::MAX;
|
||||||
|
// write out `true` and `false` cases to encourage the compiler
|
||||||
|
// to specialize the two cases separately.
|
||||||
if is_long {
|
if is_long {
|
||||||
searcher.next::<MatchOnly>(self.haystack.as_bytes(),
|
searcher.next::<MatchOnly>(self.haystack.as_bytes(),
|
||||||
self.needle.as_bytes(),
|
self.needle.as_bytes(),
|
||||||
@@ -653,8 +655,8 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
|
unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next_back(&mut self) -> SearchStep {
|
fn next_back(&mut self) -> SearchStep {
|
||||||
@@ -709,6 +711,7 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
|
|||||||
}
|
}
|
||||||
StrSearcherImpl::TwoWay(ref mut searcher) => {
|
StrSearcherImpl::TwoWay(ref mut searcher) => {
|
||||||
let is_long = searcher.memory == usize::MAX;
|
let is_long = searcher.memory == usize::MAX;
|
||||||
|
// write out `true` and `false`, like `next_match`
|
||||||
if is_long {
|
if is_long {
|
||||||
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
|
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
|
||||||
self.needle.as_bytes(),
|
self.needle.as_bytes(),
|
||||||
@@ -723,8 +726,7 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The internal state of an iterator that searches for matches of a substring
|
/// The internal state of the two-way substring search algorithm.
|
||||||
/// within a larger string using two-way search
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
struct TwoWaySearcher {
|
struct TwoWaySearcher {
|
||||||
// constants
|
// constants
|
||||||
@@ -741,7 +743,9 @@ struct TwoWaySearcher {
|
|||||||
// variables
|
// variables
|
||||||
position: usize,
|
position: usize,
|
||||||
end: usize,
|
end: usize,
|
||||||
|
/// index into needle before which we have already matched
|
||||||
memory: usize,
|
memory: usize,
|
||||||
|
/// index into needle after which we have already matched
|
||||||
memory_back: usize,
|
memory_back: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -841,9 +845,6 @@ impl TwoWaySearcher {
|
|||||||
// is large.
|
// is large.
|
||||||
if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
|
if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
|
||||||
// short period case -- the period is exact
|
// short period case -- the period is exact
|
||||||
let byteset = needle[..period].iter()
|
|
||||||
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
|
|
||||||
|
|
||||||
// compute a separate critical factorization for the reversed needle
|
// compute a separate critical factorization for the reversed needle
|
||||||
// x = u' v' where |v'| < period(x).
|
// x = u' v' where |v'| < period(x).
|
||||||
//
|
//
|
||||||
@@ -860,26 +861,26 @@ impl TwoWaySearcher {
|
|||||||
crit_pos: crit_pos,
|
crit_pos: crit_pos,
|
||||||
crit_pos_back: crit_pos_back,
|
crit_pos_back: crit_pos_back,
|
||||||
period: period,
|
period: period,
|
||||||
byteset: byteset,
|
byteset: Self::byteset_create(&needle[..period]),
|
||||||
|
|
||||||
position: 0,
|
position: 0,
|
||||||
end: end,
|
end: end,
|
||||||
memory: 0,
|
memory: 0,
|
||||||
// memory_back after which we have already matched
|
|
||||||
memory_back: needle.len(),
|
memory_back: needle.len(),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// long period case -- we have an approximation to the actual period,
|
// long period case -- we have an approximation to the actual period,
|
||||||
// and don't use memorization.
|
// and don't use memorization.
|
||||||
|
//
|
||||||
let byteset = needle.iter()
|
// Approximate the period by lower bound max(|u|, |v|) + 1.
|
||||||
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
|
// The critical factorization is efficient to use for both forward and
|
||||||
|
// reverse search.
|
||||||
|
|
||||||
TwoWaySearcher {
|
TwoWaySearcher {
|
||||||
crit_pos: crit_pos,
|
crit_pos: crit_pos,
|
||||||
crit_pos_back: crit_pos,
|
crit_pos_back: crit_pos,
|
||||||
period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
|
period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
|
||||||
byteset: byteset,
|
byteset: Self::byteset_create(needle),
|
||||||
|
|
||||||
position: 0,
|
position: 0,
|
||||||
end: end,
|
end: end,
|
||||||
@@ -889,6 +890,11 @@ impl TwoWaySearcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn byteset_create(bytes: &[u8]) -> u64 {
|
||||||
|
bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
|
||||||
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn byteset_contains(&self, byte: u8) -> bool {
|
fn byteset_contains(&self, byte: u8) -> bool {
|
||||||
(self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
|
(self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
|
||||||
@@ -976,9 +982,9 @@ impl TwoWaySearcher {
|
|||||||
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
|
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
|
||||||
// is a critical factorization, so is (reverse(v), reverse(u)).
|
// is a critical factorization, so is (reverse(v), reverse(u)).
|
||||||
//
|
//
|
||||||
// For the short period case, using memorization, we rely on |u| < period(x).
|
// For the reverse case we have computed a critical factorization x = u' v'
|
||||||
// For this case we have computed a critical factorization x = u' v'
|
// (field `crit_pos_back`). We need |u| < period(x) for the forward case and
|
||||||
// where |v'| < period(x) instead (field `crit_pos_back`).
|
// thus |v'| < period(x) for the reverse.
|
||||||
//
|
//
|
||||||
// To search in reverse through the haystack, we search forward through
|
// To search in reverse through the haystack, we search forward through
|
||||||
// a reversed haystack with a reversed needle, matching first u' and then v'.
|
// a reversed haystack with a reversed needle, matching first u' and then v'.
|
||||||
@@ -1070,7 +1076,8 @@ impl TwoWaySearcher {
|
|||||||
fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
|
fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
|
||||||
let mut left = 0; // Corresponds to i in the paper
|
let mut left = 0; // Corresponds to i in the paper
|
||||||
let mut right = 1; // Corresponds to j in the paper
|
let mut right = 1; // Corresponds to j in the paper
|
||||||
let mut offset = 0; // Corresponds to k in the paper
|
let mut offset = 0; // Corresponds to k in the paper, but starting at 0
|
||||||
|
// to match 0-based indexing.
|
||||||
let mut period = 1; // Corresponds to p in the paper
|
let mut period = 1; // Corresponds to p in the paper
|
||||||
|
|
||||||
while let Some(&a) = arr.get(right + offset) {
|
while let Some(&a) = arr.get(right + offset) {
|
||||||
@@ -1117,7 +1124,8 @@ impl TwoWaySearcher {
|
|||||||
{
|
{
|
||||||
let mut left = 0; // Corresponds to i in the paper
|
let mut left = 0; // Corresponds to i in the paper
|
||||||
let mut right = 1; // Corresponds to j in the paper
|
let mut right = 1; // Corresponds to j in the paper
|
||||||
let mut offset = 0; // Corresponds to k in the paper
|
let mut offset = 0; // Corresponds to k in the paper, but starting at 0
|
||||||
|
// to match 0-based indexing.
|
||||||
let mut period = 1; // Corresponds to p in the paper
|
let mut period = 1; // Corresponds to p in the paper
|
||||||
let n = arr.len();
|
let n = arr.len();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user