kmx git

Commit 531514048a1dbfabe9ae983dab49bb4239589469

2021-05-01T18:01:07
Tests: Exhaustive pattern tests (#2688)
diff --git a/components/prism-erb.js b/components/prism-erb.js
index 132f4f5..989d56f 100644
--- a/components/prism-erb.js
+++ b/components/prism-erb.js
@@ -9,7 +9,7 @@
 	});
 
 	Prism.hooks.add('before-tokenize', function (env) {
-		var erbPattern = /<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s[\s\S]*?^=end)+?%>/gm;
+		var erbPattern = /<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s(?:[^\r\n]|[\r\n](?!=end))*[\r\n]=end)+?%>/gm;
 		Prism.languages['markup-templating'].buildPlaceholders(env, 'erb', erbPattern);
 	});
 
diff --git a/components/prism-erb.min.js b/components/prism-erb.min.js
index 25d3c27..a9ca5e4 100644
--- a/components/prism-erb.min.js
+++ b/components/prism-erb.min.js
@@ -1 +1 @@
-!function(n){n.languages.erb=n.languages.extend("ruby",{}),n.languages.insertBefore("erb","comment",{delimiter:{pattern:/^<%=?|%>$/,alias:"punctuation"}}),n.hooks.add("before-tokenize",function(e){n.languages["markup-templating"].buildPlaceholders(e,"erb",/<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s[\s\S]*?^=end)+?%>/gm)}),n.hooks.add("after-tokenize",function(e){n.languages["markup-templating"].tokenizePlaceholders(e,"erb")})}(Prism);
\ No newline at end of file
+!function(n){n.languages.erb=n.languages.extend("ruby",{}),n.languages.insertBefore("erb","comment",{delimiter:{pattern:/^<%=?|%>$/,alias:"punctuation"}}),n.hooks.add("before-tokenize",function(e){n.languages["markup-templating"].buildPlaceholders(e,"erb",/<%=?(?:[^\r\n]|[\r\n](?!=begin)|[\r\n]=begin\s(?:[^\r\n]|[\r\n](?!=end))*[\r\n]=end)+?%>/gm)}),n.hooks.add("after-tokenize",function(e){n.languages["markup-templating"].tokenizePlaceholders(e,"erb")})}(Prism);
\ No newline at end of file
diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js
index bf1ac4f..90535e4 100644
--- a/tests/pattern-tests.js
+++ b/tests/pattern-tests.js
@@ -1,26 +1,41 @@
+// @ts-check
 'use strict';
 
 const { assert } = require('chai');
 const PrismLoader = require('./helper/prism-loader');
+const TestDiscovery = require('./helper/test-discovery');
+const TestCase = require('./helper/test-case');
 const { BFS, parseRegex } = require('./helper/util');
 const { languages } = require('../components.json');
 const { visitRegExpAST } = require('regexpp');
 const { transform, combineTransformers, JS, Words, NFA, Transformers } = require('refa');
 const scslre = require('scslre');
+const path = require('path');
 
 /**
- * A set of all safe (non-exponentially backtracking) RegExp literals (string).
+ * A map from language id to a list of code snippets in that language.
  *
- * @type {Set<string>}
+ * @type {Map<string, string[]>}
  */
-const expoSafeRegexes = new Set();
+const testSnippets = new Map();
+const testSuite = TestDiscovery.loadAllTests(__dirname + '/languages');
+for (const languageIdentifier in testSuite) {
+	const lang = TestCase.parseLanguageNames(languageIdentifier).mainLanguage;
+	let snippets = testSnippets.get(lang);
+	if (snippets === undefined) {
+		snippets = [];
+		testSnippets.set(lang, snippets);
+	}
+
+	for (const file of testSuite[languageIdentifier]) {
+		if (path.extname(file) === '.test') {
+			snippets.push(TestCase.parseTestCaseFile(file).code);
+		} else {
+			snippets.push(...Object.keys(require(file)));
+		}
+	}
+}
 
-/**
- * A set of all safe (non-polynomially backtracking) RegExp literals (string).
- *
- * @type {Set<string>}
- */
-const polySafeRegexes = new Set();
 
 for (const lang in languages) {
 	if (lang === 'meta') {
@@ -29,19 +44,9 @@ for (const lang in languages) {
 
 	describe(`Patterns of '${lang}'`, function () {
 		const Prism = PrismLoader.createInstance(lang);
-		testPatterns(Prism);
+		testPatterns(Prism, lang);
 	});
 
-	function toArray(value) {
-		if (Array.isArray(value)) {
-			return value;
-		} else if (value != null) {
-			return [value];
-		} else {
-			return [];
-		}
-	}
-
 	let optional = toArray(languages[lang].optional);
 	let modify = toArray(languages[lang].modify);
 
@@ -56,7 +61,7 @@ for (const lang in languages) {
 
 		describe(name, function () {
 			const Prism = PrismLoader.createInstance([...optional, ...modify, lang]);
-			testPatterns(Prism);
+			testPatterns(Prism, lang);
 		});
 	}
 }
@@ -65,6 +70,7 @@ for (const lang in languages) {
  * Tests all patterns in the given Prism instance.
  *
  * @param {any} Prism
+ * @param {string} mainLanguage
  *
  * @typedef {import("./helper/util").LiteralAST} LiteralAST
  * @typedef {import("regexpp/ast").CapturingGroup} CapturingGroup
@@ -73,7 +79,40 @@ for (const lang in languages) {
  * @typedef {import("regexpp/ast").LookaroundAssertion} LookaroundAssertion
  * @typedef {import("regexpp/ast").Pattern} Pattern
  */
-function testPatterns(Prism) {
+function testPatterns(Prism, mainLanguage) {
+
+	/**
+	 * Returns a list of relevant languages in the Prism instance.
+	 *
+	 * The list does not include readonly dependencies and aliases.
+	 *
+	 * @returns {string[]}
+	 */
+	function getRelevantLanguages() {
+		return [mainLanguage, ...toArray(languages[mainLanguage].modify)]
+			.filter(lang => lang in Prism.languages);
+	}
+
+	/**
+	 * @param {string} root
+	 * @param {Parameters<Parameters<typeof BFS>[1]>[0]} path
+	 * @returns {string}
+	 */
+	function BFSPathToString(root, path) {
+		let pathStr = root;
+		for (const { key } of path) {
+			if (!key) {
+				// do nothing
+			} else if (/^\d+$/.test(key)) {
+				pathStr += `[${key}]`;
+			} else if (/^[a-z]\w*$/i.test(key)) {
+				pathStr += `.${key}`;
+			} else {
+				pathStr += `[${JSON.stringify(key)}]`;
+			}
+		}
+		return pathStr;
+	}
 
 	/**
 	 * Invokes the given function on every pattern in `Prism.languages`.
@@ -94,49 +133,73 @@ function testPatterns(Prism) {
 	 * @property {(message: string) => void} reportError
 	 */
 	function forEachPattern(callback) {
+		const visited = new Set();
 		const errors = [];
 
-		BFS(Prism.languages, path => {
-			const { key, value } = path[path.length - 1];
-
-			let tokenPath = 'Prism.languages';
-			for (const { key } of path) {
-				if (!key) {
-					// do nothing
-				} else if (/^\d+$/.test(key)) {
-					tokenPath += `[${key}]`;
-				} else if (/^[a-z]\w*$/i.test(key)) {
-					tokenPath += `.${key}`;
-				} else {
-					tokenPath += `[${JSON.stringify(key)}]`;
-				}
+		/**
+		 * @param {object} root
+		 * @param {string} rootStr
+		 */
+		function traverse(root, rootStr) {
+			if (visited.has(root)) {
+				return;
 			}
+			visited.add(root);
 
-			if (Object.prototype.toString.call(value) == '[object RegExp]') {
-				try {
-					let ast;
+			BFS(root, path => {
+				const { key, value } = path[path.length - 1];
+				visited.add(value);
+
+				const tokenPath = BFSPathToString(rootStr, path);
+
+				if (Object.prototype.toString.call(value) == '[object RegExp]') {
 					try {
-						ast = parseRegex(value);
+						let ast;
+						try {
+							ast = parseRegex(value);
+						} catch (error) {
+							throw new SyntaxError(`Invalid RegExp at ${tokenPath}\n\n${error.message}`);
+						}
+
+						const parent = path.length > 1 ? path[path.length - 2].value : undefined;
+						callback({
+							pattern: value,
+							ast,
+							tokenPath,
+							name: key,
+							parent,
+							path,
+							lookbehind: key === 'pattern' && parent && !!parent.lookbehind,
+							reportError: message => errors.push(message)
+						});
 					} catch (error) {
-						throw new SyntaxError(`Invalid RegExp at ${tokenPath}\n\n${error.message}`);
+						errors.push(error);
 					}
-
-					const parent = path.length > 1 ? path[path.length - 2].value : undefined;
-					callback({
-						pattern: value,
-						ast,
-						tokenPath,
-						name: key,
-						parent,
-						path,
-						lookbehind: key === 'pattern' && parent && !!parent.lookbehind,
-						reportError: message => errors.push(message)
-					});
-				} catch (error) {
-					errors.push(error);
 				}
+			});
+		}
+
+		// static analysis
+		traverse(Prism.languages, 'Prism.languages');
+
+		// dynamic analysis
+		for (const lang of getRelevantLanguages()) {
+			const snippets = testSnippets.get(lang);
+			const grammar = Prism.languages[lang];
+
+			const oldTokenize = Prism.tokenize;
+			Prism.tokenize = function (_, grammar) {
+				const result = oldTokenize.apply(this, arguments);
+				traverse(grammar, lang + ': <Unknown>');
+				return result;
+			};
+
+			for (const snippet of (snippets || [])) {
+				Prism.highlight(snippet, grammar, lang);
 			}
-		});
+
+			Prism.tokenize = oldTokenize;
+		}
 
 		if (errors.length > 0) {
 			throw new Error(errors.map(e => String(e.message || e)).join('\n\n'));
@@ -165,91 +228,6 @@ function testPatterns(Prism) {
 		});
 	}
 
-	/**
-	 * Returns whether the given element will always have zero width meaning that it doesn't consume characters.
-	 *
-	 * @param {Element} element
-	 * @returns {boolean}
-	 */
-	function isAlwaysZeroWidth(element) {
-		switch (element.type) {
-			case 'Assertion':
-				// assertions == ^, $, \b, lookarounds
-				return true;
-			case 'Quantifier':
-				return element.max === 0 || isAlwaysZeroWidth(element.element);
-			case 'CapturingGroup':
-			case 'Group':
-				// every element in every alternative has to be of zero length
-				return element.alternatives.every(alt => alt.elements.every(isAlwaysZeroWidth));
-			case 'Backreference':
-				// on if the group referred to is of zero length
-				return isAlwaysZeroWidth(element.resolved);
-			default:
-				return false; // what's left are characters
-		}
-	}
-
-	/**
-	 * Returns whether the given element will always at the start of the whole match.
-	 *
-	 * @param {Element} element
-	 * @returns {boolean}
-	 */
-	function isFirstMatch(element) {
-		const parent = element.parent;
-		switch (parent.type) {
-			case 'Alternative': {
-				// all elements before this element have to of zero length
-				if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(isAlwaysZeroWidth)) {
-					return false;
-				}
-				const grandParent = parent.parent;
-				if (grandParent.type === 'Pattern') {
-					return true;
-				} else {
-					return isFirstMatch(grandParent);
-				}
-			}
-
-			case 'Quantifier':
-				if (parent.max >= 2) {
-					return false;
-				} else {
-					return isFirstMatch(parent);
-				}
-
-			default:
-				throw new Error(`Internal error: The given node should not be a '${element.type}'.`);
-		}
-	}
-
-	/**
-	 * Returns whether the given node either is or is a child of what is effectively a Kleene star.
-	 *
-	 * @param {import("regexpp/ast").Node} node
-	 * @returns {boolean}
-	 */
-	function underAStar(node) {
-		if (node.type === 'Quantifier' && node.max > 10) {
-			return true;
-		} else if (node.parent) {
-			return underAStar(node.parent);
-		} else {
-			return false;
-		}
-	}
-
-	/**
-	 * @param {Iterable<T>} iter
-	 * @returns {T | undefined}
-	 * @template T
-	 */
-	function firstOf(iter) {
-		const [first] = iter;
-		return first;
-	}
-
 
 	it('- should not match the empty string', function () {
 		forEachPattern(({ pattern, tokenPath }) => {
@@ -384,221 +362,370 @@ function testPatterns(Prism) {
 	});
 
 	it('- should not cause exponential backtracking', function () {
-		/** @type {Transformers.CreationOptions} */
-		const options = {
-			ignoreOrder: true,
-			ignoreAmbiguity: true
-		};
-		const transformer = combineTransformers([
-			Transformers.inline(options),
-			Transformers.removeDeadBranches(options),
-			Transformers.unionCharacters(options),
-			Transformers.moveUpEmpty(options),
-			Transformers.nestedQuantifiers(options),
-			Transformers.sortAssertions(options),
-			Transformers.removeUnnecessaryAssertions(options),
-			Transformers.applyAssertions(options),
-		]);
-
-		forEachPattern(({ pattern, ast, tokenPath }) => {
-			const patternStr = String(pattern);
-			if (expoSafeRegexes.has(patternStr)) {
-				// we know that the pattern won't cause exp backtracking because we checked before
-				return;
-			}
+		replaceRegExpProto(exec => {
+			return function (input) {
+				checkExponentialBacktracking('<Unknown>', this);
+				return exec.call(this, input);
+			};
+		}, () => {
+			forEachPattern(({ pattern, ast, tokenPath }) => {
+				checkExponentialBacktracking(tokenPath, pattern, ast);
+			});
+		});
+	});
 
-			const parser = JS.Parser.fromAst(ast);
-			/**
-			 * Parses the given element and creates its NFA.
-			 *
-			 * @param {import("refa").JS.ParsableElement} element
-			 * @returns {NFA}
-			 */
-			function toNFA(element) {
-				let { expression, maxCharacter } = parser.parseElement(element, {
-					maxBackreferenceWords: 1000,
-					backreferences: 'disable'
-				});
+	it('- should not cause polynomial backtracking', function () {
+		replaceRegExpProto(exec => {
+			return function (input) {
+				checkPolynomialBacktracking('<Unknown>', this);
+				return exec.call(this, input);
+			};
+		}, () => {
+			forEachPattern(({ pattern, ast, tokenPath }) => {
+				checkPolynomialBacktracking(tokenPath, pattern, ast);
+			});
+		});
+	});
 
-				// try to remove assertions
-				expression = transform(transformer, expression);
+}
 
-				return NFA.fromRegex(expression, { maxCharacter }, { assertions: 'disable' });
-			}
 
-			/**
-			 * Checks whether the alternatives of the given node are disjoint. If the alternatives are not disjoint
-			 * and the give node is a descendant of an effective Kleene star, then an error will be thrown.
-			 *
-			 * @param {CapturingGroup | Group | LookaroundAssertion} node
-			 * @returns {void}
-			 */
-			function checkDisjointAlternatives(node) {
-				if (!underAStar(node) || node.alternatives.length < 2) {
-					return;
-				}
+/**
+ * Returns whether the given element will always have zero width meaning that it doesn't consume characters.
+ *
+ * @param {Element} element
+ * @returns {boolean}
+ */
+function isAlwaysZeroWidth(element) {
+	switch (element.type) {
+		case 'Assertion':
+			// assertions == ^, $, \b, lookarounds
+			return true;
+		case 'Quantifier':
+			return element.max === 0 || isAlwaysZeroWidth(element.element);
+		case 'CapturingGroup':
+		case 'Group':
+			// every element in every alternative has to be of zero length
+			return element.alternatives.every(alt => alt.elements.every(isAlwaysZeroWidth));
+		case 'Backreference':
+			// only if the group referred to is of zero length
+			return isAlwaysZeroWidth(element.resolved);
+		default:
+			return false; // what's left are characters
+	}
+}
 
-				const alternatives = node.alternatives;
-
-				const total = toNFA(alternatives[0]);
-				total.withoutEmptyWord();
-				for (let i = 1, l = alternatives.length; i < l; i++) {
-					const a = alternatives[i];
-					const current = toNFA(a);
-					current.withoutEmptyWord();
-
-					if (!total.isDisjointWith(current)) {
-						assert.fail(`${tokenPath}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
-							+ ` This will cause exponential backtracking.`
-							+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
-							+ ` The goal is that all of its alternatives are disjoint.`
-							+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
-							+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
-							+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
-							+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
-							+ `\nIn the real world, patterns can be a lot harder to fix.`
-							+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
-							+ ` A maintainer will help you.`
-							+ `\n\nFull pattern:\n${pattern}`);
-					} else if (i !== l - 1) {
-						total.union(current);
-					}
-				}
+/**
+ * Returns whether the given element will always be at the start of the whole match.
+ *
+ * @param {Element} element
+ * @returns {boolean}
+ */
+function isFirstMatch(element) {
+	const parent = element.parent;
+	switch (parent.type) {
+		case 'Alternative': {
+			// all elements before this element have to be of zero length
+			if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(isAlwaysZeroWidth)) {
+				return false;
 			}
+			const grandParent = parent.parent;
+			if (grandParent.type === 'Pattern') {
+				return true;
+			} else {
+				return isFirstMatch(grandParent);
+			}
+		}
 
-			visitRegExpAST(ast.pattern, {
-				onCapturingGroupLeave: checkDisjointAlternatives,
-				onGroupLeave: checkDisjointAlternatives,
-				onAssertionLeave(node) {
-					if (node.kind === 'lookahead' || node.kind === 'lookbehind') {
-						checkDisjointAlternatives(node);
-					}
-				},
+		case 'Quantifier':
+			if (parent.max >= 2) {
+				return false;
+			} else {
+				return isFirstMatch(parent);
+			}
 
-				onQuantifierLeave(node) {
-					if (node.max < 10) {
-						return; // not a star
-					}
-					if (node.element.type !== 'CapturingGroup' && node.element.type !== 'Group') {
-						return; // not a group
-					}
+		default:
+			throw new Error(`Internal error: The given node should not be a '${element.type}'.`);
+	}
+}
 
-					// The idea here is the following:
-					//
-					// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
-					// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
-					// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
-					// is accepted by `A{m}`.
-					// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
-					// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
-					// requirement for exponential backtracking.
-					//
-					// This is actually only a crude approximation for the real analysis that would have to be done. We
-					// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
-					// cases, the approximation is good enough.
-
-					const nfa = toNFA(node.element);
-					nfa.withoutEmptyWord();
-					const twoStar = nfa.copy();
-					twoStar.quantify(2, Infinity);
-
-					if (!nfa.isDisjointWith(twoStar)) {
-						const word = Words.pickMostReadableWord(firstOf(nfa.intersectionWordSets(twoStar)));
-						const example = Words.fromUnicodeToString(word);
-						assert.fail(`${tokenPath}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
-							+ ` This will cause exponential backtracking.`
-							+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
-							+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
-							+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
-							+ `\n\nExample 1: \`(?:\\w+|::)+\``
-							+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
-							+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
-							+ `\nExample 2: \`(?:\\w|Foo)+\``
-							+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
-							+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
-							+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
-							+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
-							+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
-							+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
-							+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
-							+ ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
-							+ `\n\nIn the real world, patterns can be a lot harder to fix.`
-							+ ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
-							+ `\n\nFull pattern:\n${pattern}`);
-					}
-				},
-			});
+/**
+ * Returns whether the given node either is or is a child of what is effectively a Kleene star.
+ *
+ * @param {import("regexpp/ast").Node} node
+ * @returns {boolean}
+ */
+function underAStar(node) {
+	if (node.type === 'Quantifier' && node.max > 10) {
+		return true;
+	} else if (node.parent) {
+		return underAStar(node.parent);
+	} else {
+		return false;
+	}
+}
+
+/**
+ * @param {Iterable<T>} iter
+ * @returns {T | undefined}
+ * @template T
+ */
+function firstOf(iter) {
+	const [first] = iter;
+	return first;
+}
+
+/**
+ * A set of all safe (non-exponentially backtracking) RegExps, stored both as RegExp objects and as their literal strings.
+ *
+ * @type {Set<string | RegExp>}
+ */
+const expoSafeRegexes = new Set();
 
-			expoSafeRegexes.add(patternStr);
+/** @type {Transformers.CreationOptions} */
+const options = {
+	ignoreOrder: true,
+	ignoreAmbiguity: true
+};
+const transformer = combineTransformers([
+	Transformers.inline(options),
+	Transformers.removeDeadBranches(options),
+	Transformers.unionCharacters(options),
+	Transformers.moveUpEmpty(options),
+	Transformers.nestedQuantifiers(options),
+	Transformers.sortAssertions(options),
+	Transformers.removeUnnecessaryAssertions(options),
+	Transformers.applyAssertions(options),
+]);
+
+
+/**
+ * @param {string} path
+ * @param {RegExp} pattern
+ * @param {LiteralAST} [ast]
+ * @returns {void}
+ */
+function checkExponentialBacktracking(path, pattern, ast) {
+	if (expoSafeRegexes.has(pattern)) {
+		// we know that the pattern won't cause exp backtracking because we checked before
+		return;
+	}
+	const patternStr = String(pattern);
+	if (expoSafeRegexes.has(patternStr)) {
+		// we know that the pattern won't cause exp backtracking because we checked before
+		return;
+	}
+
+	if (!ast) {
+		ast = parseRegex(pattern);
+	}
+
+	const parser = JS.Parser.fromAst(ast);
+	/**
+	 * Parses the given element and creates its NFA.
+	 *
+	 * @param {import("refa").JS.ParsableElement} element
+	 * @returns {NFA}
+	 */
+	function toNFA(element) {
+		let { expression, maxCharacter } = parser.parseElement(element, {
+			maxBackreferenceWords: 1000,
+			backreferences: 'disable'
 		});
-	});
 
-	it('- should not cause polynomial backtracking', function () {
-		forEachPattern(({ pattern, ast, tokenPath }) => {
-			const patternStr = String(pattern);
-			if (polySafeRegexes.has(patternStr)) {
-				// we know that the pattern won't cause poly backtracking because we checked before
-				return;
+		return NFA.fromRegex(transform(transformer, expression), { maxCharacter }, { assertions: 'disable' });
+	}
+
+	/**
+	 * Checks whether the alternatives of the given node are disjoint. If the alternatives are not disjoint
+	 * and the given node is a descendant of an effective Kleene star, then an error will be thrown.
+	 *
+	 * @param {CapturingGroup | Group | LookaroundAssertion} node
+	 * @returns {void}
+	 */
+	function checkDisjointAlternatives(node) {
+		if (!underAStar(node) || node.alternatives.length < 2) {
+			return;
+		}
+
+		const alternatives = node.alternatives;
+
+		const total = toNFA(alternatives[0]);
+		total.withoutEmptyWord();
+		for (let i = 1, l = alternatives.length; i < l; i++) {
+			const a = alternatives[i];
+			const current = toNFA(a);
+			current.withoutEmptyWord();
+
+			if (!total.isDisjointWith(current)) {
+				assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
+					+ ` This will cause exponential backtracking.`
+					+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
+					+ ` The goal is that all of its alternatives are disjoint.`
+					+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
+					+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
+					+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
+					+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
+					+ `\nIn the real world, patterns can be a lot harder to fix.`
+					+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
+					+ ` A maintainer will help you.`
+					+ `\n\nFull pattern:\n${pattern}`);
+			} else if (i !== l - 1) {
+				total.union(current);
 			}
+		}
+	}
 
-			const result = scslre.analyse(ast, { maxReports: 1, reportTypes: { 'Move': false } });
-			if (result.reports.length > 0) {
-				const report = result.reports[0];
-
-				let rangeOffset;
-				let rangeStr;
-				let rangeHighlight;
-
-				switch (report.type) {
-					case 'Trade': {
-						const start = Math.min(report.startQuant.start, report.endQuant.start);
-						const end = Math.max(report.startQuant.end, report.endQuant.end);
-						rangeOffset = start + 1;
-						rangeStr = patternStr.substring(start + 1, end + 1);
-						rangeHighlight = highlight([
-							{ ...report.startQuant, label: 'start' },
-							{ ...report.endQuant, label: 'end' }
-						], -start);
-						break;
-					}
-					case 'Self': {
-						rangeOffset = report.parentQuant.start + 1;
-						rangeStr = patternStr.substring(report.parentQuant.start + 1, report.parentQuant.end + 1);
-						rangeHighlight = highlight([{ ...report.quant, label: 'self' }], -report.parentQuant.start);
-						break;
-					}
-					case 'Move': {
-						rangeOffset = 1;
-						rangeStr = patternStr.substring(1, report.quant.end + 1);
-						rangeHighlight = highlight([report.quant]);
-						break;
-					}
-					default:
-						throw new Error('Invalid report type "' + report.type + '". This should never happen.');
-				}
+	visitRegExpAST(ast.pattern, {
+		onCapturingGroupLeave: checkDisjointAlternatives,
+		onGroupLeave: checkDisjointAlternatives,
+		onAssertionLeave(node) {
+			if (node.kind === 'lookahead' || node.kind === 'lookbehind') {
+				checkDisjointAlternatives(node);
+			}
+		},
 
-				const attackChar = `/${report.character.literal.source}/${report.character.literal.flags}`;
-				const fixed = report.fix();
-
-				assert.fail(
-					`${tokenPath}: ${report.exponential ? 'Exponential' : 'Polynomial'} backtracking. `
-					+ `By repeating any character that matches ${attackChar}, an attack string can be created.`
-					+ `\n`
-					+ `\n${indent(rangeStr)}`
-					+ `\n${indent(rangeHighlight)}`
-					+ `\n`
-					+ `\nFull pattern:`
-					+ `\n${patternStr}`
-					+ `\n${indent(rangeHighlight, ' '.repeat(rangeOffset))}`
-					+ `\n`
-					+ `\n` + (fixed ? `Fixed:\n/${fixed.source}/${fixed.flags}` : `Fix not available.`)
-				);
+		onQuantifierLeave(node) {
+			if (node.max < 10) {
+				return; // not a star
+			}
+			if (node.element.type !== 'CapturingGroup' && node.element.type !== 'Group') {
+				return; // not a group
 			}
 
-			polySafeRegexes.add(patternStr);
-		});
+			// The idea here is the following:
+			//
+			// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
+			// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
+			// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
+			// is accepted by `A{m}`.
+			// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
+			// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
+			// requirement for exponential backtracking.
+			//
+			// This is actually only a crude approximation for the real analysis that would have to be done. We
+			// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
+			// cases, the approximation is good enough.
+
+			const nfa = toNFA(node.element);
+			nfa.withoutEmptyWord();
+			const twoStar = nfa.copy();
+			twoStar.quantify(2, Infinity);
+
+			if (!nfa.isDisjointWith(twoStar)) {
+				const word = Words.pickMostReadableWord(firstOf(nfa.intersectionWordSets(twoStar)));
+				const example = Words.fromUnicodeToString(word);
+				assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
+					+ ` This will cause exponential backtracking.`
+					+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
+					+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
+					+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
+					+ `\n\nExample 1: \`(?:\\w+|::)+\``
+					+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
+					+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
+					+ `\nExample 2: \`(?:\\w|Foo)+\``
+					+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
+					+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
+					+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
+					+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
+					+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
+					+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
+					+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
+					+ ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
+					+ `\n\nIn the real world, patterns can be a lot harder to fix.`
+					+ ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
+					+ `\n\nFull pattern:\n${pattern}`);
+			}
+		},
 	});
 
+	expoSafeRegexes.add(pattern);
+	expoSafeRegexes.add(patternStr);
+}
+
+/**
+ * A set of all safe (non-polynomially backtracking) RegExps, stored both as RegExp objects and as their literal strings.
+ *
+ * @type {Set<string | RegExp>}
+ */
+const polySafeRegexes = new Set();
+/**
+ * @param {string} path
+ * @param {RegExp} pattern
+ * @param {LiteralAST} [ast]
+ * @returns {void}
+ */
+function checkPolynomialBacktracking(path, pattern, ast) {
+	if (polySafeRegexes.has(pattern)) {
+		// we know that the pattern won't cause poly backtracking because we checked before
+		return;
+	}
+	const patternStr = String(pattern);
+	if (polySafeRegexes.has(patternStr)) {
+		// we know that the pattern won't cause poly backtracking because we checked before
+		return;
+	}
+
+	if (!ast) {
+		ast = parseRegex(pattern);
+	}
+
+	const result = scslre.analyse(ast, { maxReports: 1, reportTypes: { 'Move': false } });
+	if (result.reports.length > 0) {
+		const report = result.reports[0];
+
+		let rangeOffset;
+		let rangeStr;
+		let rangeHighlight;
+
+		switch (report.type) {
+			case 'Trade': {
+				const start = Math.min(report.startQuant.start, report.endQuant.start);
+				const end = Math.max(report.startQuant.end, report.endQuant.end);
+				rangeOffset = start + 1;
+				rangeStr = patternStr.substring(start + 1, end + 1);
+				rangeHighlight = highlight([
+					{ ...report.startQuant, label: 'start' },
+					{ ...report.endQuant, label: 'end' }
+				], -start);
+				break;
+			}
+			case 'Self': {
+				rangeOffset = report.parentQuant.start + 1;
+				rangeStr = patternStr.substring(report.parentQuant.start + 1, report.parentQuant.end + 1);
+				rangeHighlight = highlight([{ ...report.quant, label: 'self' }], -report.parentQuant.start);
+				break;
+			}
+			case 'Move': {
+				rangeOffset = 1;
+				rangeStr = patternStr.substring(1, report.quant.end + 1);
+				rangeHighlight = highlight([report.quant]);
+				break;
+			}
+			default:
+				throw new Error('Invalid report type. This should never happen.');
+		}
+
+		const attackChar = `/${report.character.literal.source}/${report.character.literal.flags}`;
+		const fixed = report.fix();
+
+		assert.fail(
+			`${path}: ${report.exponential ? 'Exponential' : 'Polynomial'} backtracking. `
+			+ `By repeating any character that matches ${attackChar}, an attack string can be created.`
+			+ `\n`
+			+ `\n${indent(rangeStr)}`
+			+ `\n${indent(rangeHighlight)}`
+			+ `\n`
+			+ `\nFull pattern:`
+			+ `\n${patternStr}`
+			+ `\n${indent(rangeHighlight, ' '.repeat(rangeOffset))}`
+			+ `\n`
+			+ `\n` + (fixed ? `Fixed:\n/${fixed.source}/${fixed.flags}` : `Fix not available.`)
+		);
+	}
+
+	polySafeRegexes.add(pattern);
+	polySafeRegexes.add(patternStr);
 }
 
 /**
@@ -609,6 +736,7 @@ function testPatterns(Prism) {
  * @typedef Highlight
  * @property {number} start
  * @property {number} end
+ * @property {string} [label]
  */
 function highlight(highlights, offset = 0) {
 	highlights.sort((a, b) => a.start - b.start);
@@ -646,3 +774,47 @@ function highlight(highlights, offset = 0) {
 function indent(str, amount = '    ') {
 	return str.split(/\r?\n/g).map(m => m === '' ? '' : amount + m).join('\n');
 }
+
+/**
+ * @param {(exec: RegExp["exec"]) => RegExp["exec"]} execSupplier
+ * @param {() => void} fn
+ */
+function replaceRegExpProto(execSupplier, fn) {
+	const oldExec = RegExp.prototype.exec;
+	const oldTest = RegExp.prototype.test;
+	const newExec = execSupplier(oldExec);
+
+	RegExp.prototype.exec = newExec;
+	RegExp.prototype.test = function (input) {
+		return newExec.call(this, input) !== null;
+	};
+
+	let error;
+	try {
+		fn();
+	} catch (e) {
+		error = e;
+	}
+
+	RegExp.prototype.exec = oldExec;
+	RegExp.prototype.test = oldTest;
+
+	if (error) {
+		throw error;
+	}
+}
+
+/**
+ * @param {undefined | null | T | T[]} value
+ * @returns {T[]}
+ * @template T
+ */
+function toArray(value) {
+	if (Array.isArray(value)) {
+		return value;
+	} else if (value != null) {
+		return [value];
+	} else {
+		return [];
+	}
+}
kmx.io/prism.js

Commit 531514048a1dbfabe9ae983dab49bb4239589469