Commit ead22e1e03f87319ceb20c712e6424c55276c887

Michael Schmidt 2022-03-13T10:57:32

Tests: Cache results for exp backtracking check (#3356)

diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js
index 91ce21e..72bb965 100644
--- a/tests/pattern-tests.js
+++ b/tests/pattern-tests.js
@@ -8,7 +8,7 @@ const TestCase = require('./helper/test-case');
 const { BFS, BFSPathToPrismTokenPath, parseRegex } = require('./helper/util');
 const { languages } = require('../components.json');
 const { visitRegExpAST } = require('regexpp');
-const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers } = require('refa');
+const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers, isDisjointWith } = require('refa');
 const scslre = require('scslre');
 const { argv } = require('yargs');
 const RAA = require('regexp-ast-analysis');
@@ -461,6 +461,50 @@ const transformer = combineTransformers([
 ]);
 
 
+/** @type {Map<string, Map<string, Error | null>>} */
+const resultCache = new Map();
+/**
+ * @param {string} cacheName
+ * @returns {Map<string, Error | null>}
+ */
+function getResultCache(cacheName) {
+	let cache = resultCache.get(cacheName);
+	if (cache === undefined) {
+		resultCache.set(cacheName, cache = new Map());
+	}
+	return cache;
+}
+/**
+ * @param {string} cacheName
+ * @param {T} cacheKey
+ * @param {(node: T) => void} compute
+ * @returns {void}
+ * @template {import('regexpp/ast').Node} T
+ */
+function withResultCache(cacheName, cacheKey, compute) {
+	const hasBackRef = RAA.hasSomeDescendant(cacheKey, n => n.type === 'Backreference');
+	if (hasBackRef) {
+		compute(cacheKey);
+		return;
+	}
+
+	const cache = getResultCache(cacheName);
+	let cached = cache.get(cacheKey.raw);
+	if (cached === undefined) {
+		try {
+			compute(cacheKey);
+			cached = null;
+		} catch (error) {
+			cached = error;
+		}
+		cache.set(cacheKey.raw, cached);
+	}
+
+	if (cached) {
+		throw cached;
+	}
+}
+
 /**
  * @param {string} path
  * @param {RegExp} pattern
@@ -510,32 +554,34 @@ function checkExponentialBacktracking(path, pattern, ast) {
 			return;
 		}
 
-		const alternatives = node.alternatives;
-
-		const total = toNFA(alternatives[0]);
-		total.withoutEmptyWord();
-		for (let i = 1, l = alternatives.length; i < l; i++) {
-			const a = alternatives[i];
-			const current = toNFA(a);
-			current.withoutEmptyWord();
-
-			if (!total.isDisjointWith(current)) {
-				assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
-					+ ` This will cause exponential backtracking.`
-					+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
-					+ ` The goal is that all of its alternatives are disjoint.`
-					+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
-					+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
-					+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
-					+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
-					+ `\nIn the real world, patterns can be a lot harder to fix.`
-					+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
-					+ ` A maintainer will help you.`
-					+ `\n\nFull pattern:\n${pattern}`);
-			} else if (i !== l - 1) {
-				total.union(current);
+		withResultCache('disjointAlternatives', node, () => {
+			const alternatives = node.alternatives;
+
+			const total = toNFA(alternatives[0]);
+			total.withoutEmptyWord();
+			for (let i = 1, l = alternatives.length; i < l; i++) {
+				const a = alternatives[i];
+				const current = toNFA(a);
+				current.withoutEmptyWord();
+
+				if (!isDisjointWith(total, current)) {
+					assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
+						+ ` This will cause exponential backtracking.`
+						+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
+						+ ` The goal is that all of its alternatives are disjoint.`
+						+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
+						+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
+						+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
+						+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
+						+ `\nIn the real world, patterns can be a lot harder to fix.`
+						+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
+						+ ` A maintainer will help you.`
+						+ `\n\nFull pattern:\n${pattern}`);
+				} else if (i !== l - 1) {
+					total.union(current);
+				}
 			}
-		}
+		});
 	}
 
 	visitRegExpAST(ast.pattern, {
@@ -555,49 +601,51 @@ function checkExponentialBacktracking(path, pattern, ast) {
 				return; // not a group
 			}
 
-			// The idea here is the following:
-			//
-			// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
-			// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
-			// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
-			// is accepted by `A{m}`.
-			// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
-			// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
-			// requirement for exponential backtracking.
-			//
-			// This is actually only a crude approximation for the real analysis that would have to be done. We
-			// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
-			// cases, the approximation is good enough.
-
-			const nfa = toNFA(node.element);
-			nfa.withoutEmptyWord();
-			const twoStar = nfa.copy();
-			twoStar.quantify(2, Infinity);
-
-			if (!nfa.isDisjointWith(twoStar)) {
-				const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
-				const example = Words.fromUnicodeToString(word);
-				assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
-					+ ` This will cause exponential backtracking.`
-					+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
-					+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
-					+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
-					+ `\n\nExample 1: \`(?:\\w+|::)+\``
-					+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
-					+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
-					+ `\nExample 2: \`(?:\\w|Foo)+\``
-					+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
-					+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
-					+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
-					+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
-					+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
-					+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
-					+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
-					+ ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
-					+ `\n\nIn the real world, patterns can be a lot harder to fix.`
-					+ ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
-					+ `\n\nFull pattern:\n${pattern}`);
-			}
+			withResultCache('2star', node, () => {
+				// The idea here is the following:
+				//
+				// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
+				// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
+				// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
+				// is accepted by `A{m}`.
+				// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
+				// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
+				// requirement for exponential backtracking.
+				//
+				// This is actually only a crude approximation for the real analysis that would have to be done. We
+				// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
+				// cases, the approximation is good enough.
+
+				const nfa = toNFA(node.element);
+				nfa.withoutEmptyWord();
+				const twoStar = nfa.copy();
+				twoStar.quantify(2, Infinity);
+
+				if (!isDisjointWith(nfa, twoStar)) {
+					const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
+					const example = Words.fromUnicodeToString(word);
+					assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
+						+ ` This will cause exponential backtracking.`
+						+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
+						+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
+						+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
+						+ `\n\nExample 1: \`(?:\\w+|::)+\``
+						+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
+						+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
+						+ `\nExample 2: \`(?:\\w|Foo)+\``
+						+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
+						+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
+						+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
+						+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
+						+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
+						+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
+						+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
+						+ ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
+						+ `\n\nIn the real world, patterns can be a lot harder to fix.`
+						+ ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
+						+ `\n\nFull pattern:\n${pattern}`);
+				}
+			});
 		},
 	});