Tests: Cache results for exp backtracking check (#3356)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js
index 91ce21e..72bb965 100644
--- a/tests/pattern-tests.js
+++ b/tests/pattern-tests.js
@@ -8,7 +8,7 @@ const TestCase = require('./helper/test-case');
const { BFS, BFSPathToPrismTokenPath, parseRegex } = require('./helper/util');
const { languages } = require('../components.json');
const { visitRegExpAST } = require('regexpp');
-const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers } = require('refa');
+const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers, isDisjointWith } = require('refa');
const scslre = require('scslre');
const { argv } = require('yargs');
const RAA = require('regexp-ast-analysis');
@@ -461,6 +461,50 @@ const transformer = combineTransformers([
]);
+/** @type {Map<string, Map<string, Error | null>>} */
+const resultCache = new Map();
+/**
+ * @param {string} cacheName
+ * @returns {Map<string, Error | null>}
+ */
+function getResultCache(cacheName) {
+ let cache = resultCache.get(cacheName);
+ if (cache === undefined) {
+ resultCache.set(cacheName, cache = new Map());
+ }
+ return cache;
+}
+/**
+ * @param {string} cacheName
+ * @param {T} cacheKey
+ * @param {(node: T) => void} compute
+ * @returns {void}
+ * @template {import('regexpp/ast').Node} T
+ */
+function withResultCache(cacheName, cacheKey, compute) {
+ const hasBackRef = RAA.hasSomeDescendant(cacheKey, n => n.type === 'Backreference');
+ if (hasBackRef) {
+ compute(cacheKey);
+ return;
+ }
+
+ const cache = getResultCache(cacheName);
+ let cached = cache.get(cacheKey.raw);
+ if (cached === undefined) {
+ try {
+ compute(cacheKey);
+ cached = null;
+ } catch (error) {
+ cached = error;
+ }
+ cache.set(cacheKey.raw, cached);
+ }
+
+ if (cached) {
+ throw cached;
+ }
+}
+
/**
* @param {string} path
* @param {RegExp} pattern
@@ -510,32 +554,34 @@ function checkExponentialBacktracking(path, pattern, ast) {
return;
}
- const alternatives = node.alternatives;
-
- const total = toNFA(alternatives[0]);
- total.withoutEmptyWord();
- for (let i = 1, l = alternatives.length; i < l; i++) {
- const a = alternatives[i];
- const current = toNFA(a);
- current.withoutEmptyWord();
-
- if (!total.isDisjointWith(current)) {
- assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
- + ` This will cause exponential backtracking.`
- + `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
- + ` The goal is that all of its alternatives are disjoint.`
- + ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
- + `\n\nExample: \`(?:[ab]|\\w|::)+\``
- + `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
- + ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
- + `\nIn the real world, patterns can be a lot harder to fix.`
- + ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
- + ` A maintainer will help you.`
- + `\n\nFull pattern:\n${pattern}`);
- } else if (i !== l - 1) {
- total.union(current);
+ withResultCache('disjointAlternatives', node, () => {
+ const alternatives = node.alternatives;
+
+ const total = toNFA(alternatives[0]);
+ total.withoutEmptyWord();
+ for (let i = 1, l = alternatives.length; i < l; i++) {
+ const a = alternatives[i];
+ const current = toNFA(a);
+ current.withoutEmptyWord();
+
+ if (!isDisjointWith(total, current)) {
+ assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
+ + ` This will cause exponential backtracking.`
+ + `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
+ + ` The goal is that all of its alternatives are disjoint.`
+ + ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
+ + `\n\nExample: \`(?:[ab]|\\w|::)+\``
+ + `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
+ + ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
+ + `\nIn the real world, patterns can be a lot harder to fix.`
+ + ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
+ + ` A maintainer will help you.`
+ + `\n\nFull pattern:\n${pattern}`);
+ } else if (i !== l - 1) {
+ total.union(current);
+ }
}
- }
+ });
}
visitRegExpAST(ast.pattern, {
@@ -555,49 +601,51 @@ function checkExponentialBacktracking(path, pattern, ast) {
return; // not a group
}
- // The idea here is the following:
- //
- // We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
- // the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
- // that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
- // is accepted by `A{m}`.
- // This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
- // `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
- // requirement for exponential backtracking.
- //
- // This is actually only a crude approximation for the real analysis that would have to be done. We
- // would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
- // cases, the approximation is good enough.
-
- const nfa = toNFA(node.element);
- nfa.withoutEmptyWord();
- const twoStar = nfa.copy();
- twoStar.quantify(2, Infinity);
-
- if (!nfa.isDisjointWith(twoStar)) {
- const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
- const example = Words.fromUnicodeToString(word);
- assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
- + ` This will cause exponential backtracking.`
- + `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
- + ` The goal is modify E such that it is disjoint with repetitions of itself.`
- + ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
- + `\n\nExample 1: \`(?:\\w+|::)+\``
- + `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
- + ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
- + `\nExample 2: \`(?:\\w|Foo)+\``
- + `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
- + ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
- + `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
- + `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
- + ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
- + ` This means that the lazy \`.*?\` can jump over \`>\`.`
- + ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
- + ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
- + `\n\nIn the real world, patterns can be a lot harder to fix.`
- + ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
- + `\n\nFull pattern:\n${pattern}`);
- }
+ withResultCache('2star', node, () => {
+ // The idea here is the following:
+ //
+ // We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
+ // the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
+ // that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
+ // is accepted by `A{m}`.
+ // This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
+ // `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
+ // requirement for exponential backtracking.
+ //
+ // This is actually only a crude approximation for the real analysis that would have to be done. We
+ // would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
+ // cases, the approximation is good enough.
+
+ const nfa = toNFA(node.element);
+ nfa.withoutEmptyWord();
+ const twoStar = nfa.copy();
+ twoStar.quantify(2, Infinity);
+
+ if (!isDisjointWith(nfa, twoStar)) {
+ const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
+ const example = Words.fromUnicodeToString(word);
+ assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
+ + ` This will cause exponential backtracking.`
+ + `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
+ + ` The goal is modify E such that it is disjoint with repetitions of itself.`
+ + ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
+ + `\n\nExample 1: \`(?:\\w+|::)+\``
+ + `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
+ + ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
+ + `\nExample 2: \`(?:\\w|Foo)+\``
+ + `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
+ + ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
+ + `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
+ + `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
+ + ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
+ + ` This means that the lazy \`.*?\` can jump over \`>\`.`
+ + ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
+ + ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
+ + `\n\nIn the real world, patterns can be a lot harder to fix.`
+ + ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
+ + `\n\nFull pattern:\n${pattern}`);
+ }
+ });
},
});