Hash :
46ac1afa
Author :
Date :
2025-06-26T22:46:40
kwset: Add specification comments in .h file. * lib/kwset.h (kwsalloc, kwsincr, kwsprep, kwsexec, kwsfree): Move specification to here... * lib/kwset.c (kwsalloc, kwsincr, kwsprep, kwsexec, kwsfree): ...from here.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924
/* kwset.c - search for any of a set of keywords.
Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2025 Free Software
Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written August 1989 by Mike Haertel. */
/* For more on the Aho-Corasick and Boyer-Moore algorithms,
as well as other algorithms that might help improve performance,
see the grep manual's "Performance" chapter. */
#include <config.h>
#include "kwset.h"
#include <limits.h>
#include <stdckdint.h>
#include <stdint.h>
#include <sys/types.h>
#include "memchr2.h"
#include "minmax.h"
#include "obstack.h"
#include "xalloc.h"
#include "verify.h"
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* When we deliberately use duplicate branches, use this macro to
disable gcc's -Wduplicated-branches in the containing expression. */
#if 7 <= __GNUC__
# define IGNORE_DUPLICATE_BRANCH_WARNING(exp) \
({ \
_Pragma ("GCC diagnostic push") \
_Pragma ("GCC diagnostic ignored \"-Wduplicated-branches\"") \
exp; \
_Pragma ("GCC diagnostic pop") \
})
#else
# define IGNORE_DUPLICATE_BRANCH_WARNING(exp) exp
#endif
enum { NCHAR = UCHAR_MAX + 1 };
/* Convert a possibly-signed character to an unsigned character. This is
a bit safer than casting to unsigned char, since it catches some type
errors that the cast doesn't. */
static inline unsigned char
to_uchar (char ch)
{
return ch;
}
static unsigned char
U (char ch)
{
return to_uchar (ch);
}
/* Balanced tree of edges and labels leaving a given trie node. */
struct tree
{
struct tree *llink; /* Left link; MUST be first field. */
struct tree *rlink; /* Right link (to larger labels). */
struct trie *trie; /* Trie node pointed to by this edge. */
unsigned char label; /* Label on this edge. */
char balance; /* Difference in depths of subtrees. */
};
/* Node of a trie representing a set of keywords. */
struct trie
{
/* If an accepting node, this is either 2*W + 1 where W is the word
index, or is -1 if Aho-Corasick is in use and FAIL
specifies where to look for more info. If not an accepting node,
this is zero. */
ptrdiff_t accepting;
struct tree *links; /* Tree of edges leaving this node. */
struct trie *parent; /* Parent of this node. */
struct trie *next; /* List of all trie nodes in level order. */
struct trie *fail; /* Aho-Corasick failure function. */
idx_t depth; /* Depth of this node from the root. */
idx_t shift; /* Shift function for search failures. */
idx_t maxshift; /* Max shift of self and descendants. */
};
/* Structure returned opaquely to the caller, containing everything. */
struct kwset
{
struct obstack obstack; /* Obstack for node allocation. */
idx_t words; /* Number of words in the trie. */
struct trie *trie; /* The trie itself. */
idx_t mind; /* Minimum depth of an accepting node. */
unsigned char delta[NCHAR]; /* Delta table for rapid search. */
struct trie *next[NCHAR]; /* Table of children of the root. */
char *target; /* Target string if there's only one. */
idx_t *shift; /* Used in Boyer-Moore search for one
string. */
char const *trans; /* Character translation table. */
/* This helps to match a terminal byte, which is the first byte
for Aho-Corasick, and the last byte for Boyer-More. If all the
patterns have the same terminal byte (after translation via TRANS
if TRANS is nonnull), then this is that byte as an unsigned char.
Otherwise this is -1 if there is disagreement among the strings
about terminal bytes, and -2 if there are no terminal bytes and
no disagreement because all the patterns are empty. */
int gc1;
/* This helps to match a terminal byte. If 0 <= GC1HELP, B is
terminal when B == GC1 || B == GC1HELP (note that GC1 == GCHELP
is common here). This is typically faster than evaluating
to_uchar (TRANS[B]) == GC1. */
int gc1help;
/* If the string has two or more bytes, this is the penultimate byte,
after translation via TRANS if TRANS is nonnull. This variable
is used only by Boyer-Moore. */
char gc2;
/* kwsexec implementation. */
ptrdiff_t (*kwsexec) (kwset_t, char const *, idx_t, struct kwsmatch *, bool);
};
/* Use TRANS to transliterate C. A null TRANS does no transliteration. */
static inline char
tr (char const *trans, char c)
{
return trans ? trans[U(c)] : c;
}
static ptrdiff_t acexec (kwset_t, char const *, idx_t,
struct kwsmatch *, bool);
static ptrdiff_t bmexec (kwset_t, char const *, idx_t,
struct kwsmatch *, bool);
kwset_t
kwsalloc (char const *trans)
{
struct kwset *kwset = xmalloc (sizeof *kwset);
obstack_init (&kwset->obstack);
kwset->words = 0;
kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie);
kwset->trie->accepting = 0;
kwset->trie->links = nullptr;
kwset->trie->parent = nullptr;
kwset->trie->next = nullptr;
kwset->trie->fail = nullptr;
kwset->trie->depth = 0;
kwset->trie->shift = 0;
kwset->mind = IDX_MAX;
kwset->target = nullptr;
kwset->trans = trans;
kwset->kwsexec = acexec;
return kwset;
}
/* This upper bound is valid for CHAR_BIT >= 4 and
exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
void
kwsincr (kwset_t kwset, char const *text, idx_t len)
{
assume (0 <= len);
struct trie *trie = kwset->trie;
char const *trans = kwset->trans;
bool reverse = kwset->kwsexec == bmexec;
if (reverse)
text += len;
/* Descend the trie (built of keywords) character-by-character,
installing new nodes when necessary. */
while (len--)
{
unsigned char uc = reverse ? *--text : *text++;
unsigned char label = trans ? trans[uc] : uc;
/* Descend the tree of outgoing links for this trie node,
looking for the current character and keeping track
of the path followed. */
struct tree *cur = trie->links;
struct tree *links[DEPTH_SIZE];
enum { L, R } dirs[DEPTH_SIZE];
links[0] = (struct tree *) &trie->links;
dirs[0] = L;
idx_t depth = 1;
while (cur && label != cur->label)
{
links[depth] = cur;
if (label < cur->label)
dirs[depth++] = L, cur = cur->llink;
else
dirs[depth++] = R, cur = cur->rlink;
}
/* The current character doesn't have an outgoing link at
this trie node, so build a new trie node and install
a link in the current trie node's tree. */
if (!cur)
{
cur = obstack_alloc (&kwset->obstack, sizeof *cur);
cur->llink = nullptr;
cur->rlink = nullptr;
cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
cur->trie->accepting = 0;
cur->trie->links = nullptr;
cur->trie->parent = trie;
cur->trie->next = nullptr;
cur->trie->fail = nullptr;
cur->trie->depth = trie->depth + 1;
cur->trie->shift = 0;
cur->label = label;
cur->balance = 0;
/* Install the new tree node in its parent. */
if (dirs[--depth] == L)
links[depth]->llink = cur;
else
links[depth]->rlink = cur;
/* Back up the tree fixing the balance flags. */
while (depth && !links[depth]->balance)
{
if (dirs[depth] == L)
--links[depth]->balance;
else
++links[depth]->balance;
--depth;
}
/* Rebalance the tree by pointer rotations if necessary. */
if (depth && ((dirs[depth] == L && --links[depth]->balance)
|| (dirs[depth] == R && ++links[depth]->balance)))
{
struct tree *t, *r, *l, *rl, *lr;
switch (links[depth]->balance)
{
case (char) -2:
switch (dirs[depth + 1])
{
case L:
r = links[depth], t = r->llink, rl = t->rlink;
t->rlink = r, r->llink = rl;
t->balance = r->balance = 0;
break;
case R:
r = links[depth], l = r->llink, t = l->rlink;
rl = t->rlink, lr = t->llink;
t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
l->balance = t->balance != 1 ? 0 : -1;
r->balance = t->balance != (char) -1 ? 0 : 1;
t->balance = 0;
break;
default:
abort ();
}
break;
case 2:
switch (dirs[depth + 1])
{
case R:
l = links[depth], t = l->rlink, lr = t->llink;
t->llink = l, l->rlink = lr;
t->balance = l->balance = 0;
break;
case L:
l = links[depth], r = l->rlink, t = r->llink;
lr = t->llink, rl = t->rlink;
t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
l->balance = t->balance != 1 ? 0 : -1;
r->balance = t->balance != (char) -1 ? 0 : 1;
t->balance = 0;
break;
default:
abort ();
}
break;
default:
abort ();
}
if (dirs[depth - 1] == L)
links[depth - 1]->llink = t;
else
links[depth - 1]->rlink = t;
}
}
trie = cur->trie;
}
/* Mark the node finally reached as accepting, encoding the
index number of this word in the keyword set so far. */
if (!trie->accepting)
trie->accepting = 2 * kwset->words + 1;
++kwset->words;
/* Keep track of the longest and shortest string of the keyword set. */
if (trie->depth < kwset->mind)
kwset->mind = trie->depth;
}
idx_t
kwswords (kwset_t kwset)
{
return kwset->words;
}
/* Enqueue the trie nodes referenced from the given tree in the
given queue. */
static void
enqueue (struct tree *tree, struct trie **last)
{
if (!tree)
return;
enqueue (tree->llink, last);
enqueue (tree->rlink, last);
(*last) = (*last)->next = tree->trie;
}
/* Compute the Aho-Corasick failure function for the trie nodes referenced
from the given tree, given the failure function for their parent as
well as a last resort failure node. */
static void
treefails (struct tree const *tree, struct trie const *fail,
struct trie *recourse, bool reverse)
{
struct tree *cur;
if (!tree)
return;
treefails (tree->llink, fail, recourse, reverse);
treefails (tree->rlink, fail, recourse, reverse);
/* Find, in the chain of fails going back to the root, the first
node that has a descendant on the current label. */
while (fail)
{
cur = fail->links;
while (cur && tree->label != cur->label)
if (tree->label < cur->label)
cur = cur->llink;
else
cur = cur->rlink;
if (cur)
{
tree->trie->fail = cur->trie;
if (!reverse && cur->trie->accepting && !tree->trie->accepting)
tree->trie->accepting = -1;
return;
}
fail = fail->fail;
}
tree->trie->fail = recourse;
}
/* Set delta entries for the links of the given tree such that
the preexisting delta value is larger than the current depth. */
static void
treedelta (struct tree const *tree, idx_t depth, unsigned char delta[])
{
if (!tree)
return;
treedelta (tree->llink, depth, delta);
treedelta (tree->rlink, depth, delta);
if (depth < delta[tree->label])
delta[tree->label] = depth;
}
/* Return true if A has every label in B. */
static bool _GL_ATTRIBUTE_PURE
hasevery (struct tree const *a, struct tree const *b)
{
if (!b)
return true;
if (!hasevery (a, b->llink))
return false;
if (!hasevery (a, b->rlink))
return false;
while (a && b->label != a->label)
if (b->label < a->label)
a = a->llink;
else
a = a->rlink;
return !!a;
}
/* Compute a vector, indexed by character code, of the trie nodes
referenced from the given tree. */
static void
treenext (struct tree const *tree, struct trie *next[])
{
if (!tree)
return;
treenext (tree->llink, next);
treenext (tree->rlink, next);
next[tree->label] = tree->trie;
}
void
kwsprep (kwset_t kwset)
{
char const *trans = kwset->trans;
unsigned char deltabuf[NCHAR];
unsigned char *delta = trans ? deltabuf : kwset->delta;
struct trie *curr, *last;
/* Use Boyer-Moore if just one pattern, Aho-Corasick otherwise. */
bool reverse = kwset->words == 1;
if (reverse)
{
kwset_t new_kwset;
/* Enqueue the immediate descendants in the level order queue. */
for (curr = last = kwset->trie; curr; curr = curr->next)
enqueue (curr->links, &last);
/* Looking for just one string. Extract it from the trie. */
kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
curr = kwset->trie;
for (idx_t i = 0; i < kwset->mind; i++)
{
kwset->target[i] = curr->links->label;
curr = curr->next;
}
new_kwset = kwsalloc (kwset->trans);
new_kwset->kwsexec = bmexec;
kwsincr (new_kwset, kwset->target, kwset->mind);
obstack_free (&kwset->obstack, nullptr);
*kwset = *new_kwset;
free (new_kwset);
}
/* Initial values for the delta table; will be changed later. The
delta entry for a given character is the smallest depth of any
node at which an outgoing edge is labeled by that character. */
memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
/* Traverse the nodes of the trie in level order, simultaneously
computing the delta table, failure function, and shift function. */
for (curr = last = kwset->trie; curr; curr = curr->next)
{
/* Enqueue the immediate descendants in the level order queue. */
enqueue (curr->links, &last);
/* Update the delta table for the descendants of this node. */
treedelta (curr->links, curr->depth, delta);
/* Compute the failure function for the descendants of this node. */
treefails (curr->links, curr->fail, kwset->trie, reverse);
if (reverse)
{
curr->shift = kwset->mind;
curr->maxshift = kwset->mind;
/* Update the shifts at each node in the current node's chain
of fails back to the root. */
struct trie *fail;
for (fail = curr->fail; fail; fail = fail->fail)
{
/* If the current node has some outgoing edge that the fail
doesn't, then the shift at the fail should be no larger
than the difference of their depths. */
if (!hasevery (fail->links, curr->links))
if (curr->depth - fail->depth < fail->shift)
fail->shift = curr->depth - fail->depth;
/* If the current node is accepting then the shift at the
fail and its descendants should be no larger than the
difference of their depths. */
if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
fail->maxshift = curr->depth - fail->depth;
}
}
}
if (reverse)
{
/* Traverse the trie in level order again, fixing up all nodes whose
shift exceeds their inherited maxshift. */
for (curr = kwset->trie->next; curr; curr = curr->next)
{
if (curr->maxshift > curr->parent->maxshift)
curr->maxshift = curr->parent->maxshift;
if (curr->shift > curr->maxshift)
curr->shift = curr->maxshift;
}
}
/* Create a vector, indexed by character code, of the outgoing links
from the root node. Accumulate GC1 and GC1HELP. */
struct trie *nextbuf[NCHAR];
struct trie **next = trans ? nextbuf : kwset->next;
memset (next, 0, sizeof nextbuf);
treenext (kwset->trie->links, next);
int gc1 = -2;
int gc1help = -1;
for (int i = 0; i < NCHAR; i++)
{
int ti = i;
if (trans)
{
ti = U(trans[i]);
kwset->next[i] = next[ti];
}
if (kwset->next[i])
{
if (gc1 < -1)
{
gc1 = ti;
gc1help = i;
}
else if (gc1 == ti)
gc1help = gc1help == ti ? i : -1;
else if (i == ti && gc1 == gc1help)
gc1help = i;
else
gc1 = -1;
}
}
kwset->gc1 = gc1;
kwset->gc1help = gc1help;
if (reverse)
{
/* Looking for just one string. Extract it from the trie. */
kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
curr = kwset->trie;
for (idx_t i = kwset->mind; 0 < i; i--)
{
kwset->target[i - 1] = curr->links->label;
curr = curr->next;
}
if (kwset->mind > 1)
{
/* Looking for the delta2 shift that might be made after a
backwards match has failed. Extract it from the trie. */
kwset->shift
= obstack_alloc (&kwset->obstack,
sizeof *kwset->shift * (kwset->mind - 1));
curr = kwset->trie->next;
for (idx_t i = 0; i < kwset->mind - 1; i++)
{
kwset->shift[i] = curr->shift;
curr = curr->next;
}
/* The penultimate byte. */
kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
}
}
/* Fix things up for any translation table. */
if (trans)
for (int i = 0; i < NCHAR; ++i)
kwset->delta[i] = delta[U(trans[i])];
}
/* Delta2 portion of a Boyer-Moore search. *TP is the string text
pointer; it is updated in place. EP is the end of the string text,
and SP the end of the pattern. LEN is the pattern length; it must
be at least 2. TRANS, if nonnull, is the input translation table.
GC1 and GC2 are the last and second-from last bytes of the pattern,
transliterated by TRANS; the caller precomputes them for
efficiency. If D1 is nonnull, it is a delta1 table for shifting *TP
when failing. KWSET->shift says how much to shift. */
static inline bool
bm_delta2_search (char const **tpp, char const *ep, char const *sp,
idx_t len,
char const *trans, char gc1, char gc2,
unsigned char const *d1, kwset_t kwset)
{
char const *tp = *tpp;
idx_t d = len, skip = 0;
while (true)
{
idx_t i = 2;
if (tr (trans, tp[-2]) == gc2)
{
while (++i <= d)
if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
break;
if (i > d)
{
for (i = d + skip + 1; i <= len; ++i)
if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
break;
if (i > len)
{
*tpp = tp - len;
return true;
}
}
}
tp += d = kwset->shift[i - 2];
if (tp > ep)
break;
if (tr (trans, tp[-1]) != gc1)
{
if (d1)
tp += d1[U(tp[-1])];
break;
}
skip = i - 1;
}
*tpp = tp;
return false;
}
/* Return the address of the first byte in the buffer S (of size N)
that matches the terminal byte specified by KWSET, or null if there
is no match. KWSET->gc1 should be nonnegative. */
static char const *
memchr_kwset (char const *s, idx_t n, kwset_t kwset)
{
char const *slim = s + n;
if (kwset->gc1help < 0)
{
for (; s < slim; s++)
if (kwset->next[U(*s)])
return s;
}
else
{
int small_heuristic = 2;
idx_t small_bytes = small_heuristic * sizeof (unsigned long int);
while (s < slim)
{
if (kwset->next[U(*s)])
return s;
s++;
if ((uintptr_t) s % small_bytes == 0)
return memchr2 (s, kwset->gc1, kwset->gc1help, slim - s);
}
}
return nullptr;
}
/* Fast Boyer-Moore search (inlinable version). */
static inline ptrdiff_t _GL_ATTRIBUTE_PURE
bmexec_trans (kwset_t kwset, char const *text, idx_t size)
{
assume (0 <= size);
unsigned char const *d1;
char const *ep, *sp, *tp;
int d;
idx_t len = kwset->mind;
char const *trans = kwset->trans;
if (len == 0)
return 0;
if (len > size)
return -1;
if (len == 1)
{
tp = memchr_kwset (text, size, kwset);
return tp ? tp - text : -1;
}
d1 = kwset->delta;
sp = kwset->target + len;
tp = text + len;
char gc1 = kwset->gc1;
char gc2 = kwset->gc2;
/* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
idx_t len12;
if (!ckd_mul (&len12, len, 12) && len12 < size)
/* 11 is not a bug, the initial offset happens only once. */
for (ep = text + size - 11 * len; tp <= ep; )
{
char const *tp0 = tp;
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
if (d != 0)
{
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
if (d != 0)
{
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
if (d != 0)
{
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
/* As a heuristic, prefer memchr to seeking by
delta1 when the latter doesn't advance much. */
int advance_heuristic = 16 * sizeof (long);
if (advance_heuristic <= tp - tp0)
continue;
tp--;
tp = memchr_kwset (tp, text + size - tp, kwset);
if (! tp)
return -1;
tp++;
if (ep <= tp)
break;
}
}
}
if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
return tp - text;
}
/* Now only a few characters are left to search. Carefully avoid
ever producing an out-of-bounds pointer. */
ep = text + size;
d = d1[U(tp[-1])];
while (d <= ep - tp)
{
d = d1[U((tp += d)[-1])];
if (d != 0)
continue;
if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, nullptr, kwset))
return tp - text;
}
return -1;
}
/* Fast Boyer-Moore search. */
static ptrdiff_t
bmexec (kwset_t kwset, char const *text, idx_t size,
struct kwsmatch *kwsmatch, bool longest)
{
/* Help the compiler inline in two ways, depending on whether
kwset->trans is null. */
ptrdiff_t ret = (IGNORE_DUPLICATE_BRANCH_WARNING
(kwset->trans
? bmexec_trans (kwset, text, size)
: bmexec_trans (kwset, text, size)));
kwsmatch->index = 0;
kwsmatch->offset = ret;
kwsmatch->size = kwset->mind;
return ret;
}
/* Hairy multiple string search with the Aho-Corasick algorithm.
(inlinable version) */
static inline ptrdiff_t
acexec_trans (kwset_t kwset, char const *text, idx_t len,
struct kwsmatch *kwsmatch, bool longest)
{
struct trie const *trie, *accept;
char const *tp, *left, *lim;
struct tree const *tree;
char const *trans;
/* Initialize register copies and look for easy ways out. */
if (len < kwset->mind)
return -1;
trans = kwset->trans;
trie = kwset->trie;
lim = text + len;
tp = text;
if (!trie->accepting)
{
unsigned char c;
int gc1 = kwset->gc1;
while (true)
{
if (gc1 < 0)
{
while (! (trie = kwset->next[c = tr (trans, *tp++)]))
if (tp >= lim)
return -1;
}
else
{
tp = memchr_kwset (tp, lim - tp, kwset);
if (!tp)
return -1;
c = tr (trans, *tp++);
trie = kwset->next[c];
}
while (true)
{
if (trie->accepting)
goto match;
if (tp >= lim)
return -1;
c = tr (trans, *tp++);
for (tree = trie->links; c != tree->label; )
{
tree = c < tree->label ? tree->llink : tree->rlink;
if (! tree)
{
trie = trie->fail;
if (!trie)
{
trie = kwset->next[c];
if (trie)
goto have_trie;
if (tp >= lim)
return -1;
goto next_c;
}
if (trie->accepting)
{
--tp;
goto match;
}
tree = trie->links;
}
}
trie = tree->trie;
have_trie:;
}
next_c:;
}
}
match:
accept = trie;
while (accept->accepting < 0)
accept = accept->fail;
left = tp - accept->depth;
/* Try left-most longest match. */
if (longest)
{
while (tp < lim)
{
struct trie const *accept1;
char const *left1;
unsigned char c = tr (trans, *tp++);
do
{
tree = trie->links;
while (tree && c != tree->label)
tree = c < tree->label ? tree->llink : tree->rlink;
}
while (!tree && (trie = trie->fail) && accept->depth <= trie->depth);
if (!tree)
break;
trie = tree->trie;
if (trie->accepting)
{
accept1 = trie;
while (accept1->accepting < 0)
accept1 = accept1->fail;
left1 = tp - accept1->depth;
if (left1 <= left)
{
left = left1;
accept = accept1;
}
}
}
}
kwsmatch->index = accept->accepting >> 1;
kwsmatch->offset = left - text;
kwsmatch->size = accept->depth;
return left - text;
}
/* Hairy multiple string search with Aho-Corasick algorithm. */
static ptrdiff_t
acexec (kwset_t kwset, char const *text, idx_t size,
struct kwsmatch *kwsmatch, bool longest)
{
assume (0 <= size);
/* Help the compiler inline in two ways, depending on whether
kwset->trans is null. */
return (IGNORE_DUPLICATE_BRANCH_WARNING
(kwset->trans
? acexec_trans (kwset, text, size, kwsmatch, longest)
: acexec_trans (kwset, text, size, kwsmatch, longest)));
}
ptrdiff_t
kwsexec (kwset_t kwset, char const *text, idx_t size,
struct kwsmatch *kwsmatch, bool longest)
{
return kwset->kwsexec (kwset, text, size, kwsmatch, longest);
}
void
kwsfree (kwset_t kwset)
{
obstack_free (&kwset->obstack, nullptr);
free (kwset);
}