Commit 43e4e6935fd4aad92e2a2a32c063f8884562e50e

Andreas Rohner 2014-08-10T23:26:10

Allow multiple regexes per token name by using arrays. In most languages there are multiple different ways of declaring the same constructs (e.g. comments, strings, ...), and sometimes it is difficult or impractical to match all of them with a single regular expression. This patch adds the possibility to use an array of pattern objects. For example, there is a minor bug in the current definition of the `clike` language that could be solved with this patch: the character immediately in front of a single-line comment is highlighted as a comment as well, e.g. in `something// something`. This is because the definitions for both single-line and multiline comments have to be matched with a single regex, and the `lookbehind` parameter can only be applied to the first captured group. With this patch one could split the two definitions up and use `lookbehind` for both, thereby eliminating the bug: 'comment': [ { pattern: /(^|[^\\])\/\*[\w\W]*?\*\//g, lookbehind: true }, { pattern: /(^|[^\\:])\/\/.*?(\r?\n|$)/g, lookbehind: true } ],

diff --git a/components/prism-core.js b/components/prism-core.js
index 55658ab..cbaa817 100644
--- a/components/prism-core.js
+++ b/components/prism-core.js
@@ -214,57 +214,62 @@ var _ = self.Prism = {
 				continue;
 			}
 
-			var pattern = grammar[token],
-				inside = pattern.inside,
-				lookbehind = !!pattern.lookbehind,
-				lookbehindLength = 0;
+			var patterns = grammar[token];
+			patterns = (_.util.type(patterns) === "Array") ? patterns : [patterns];
 
-			pattern = pattern.pattern || pattern;
+			for (var j = 0; j < patterns.length; ++j) {
+				var pattern = patterns[j],
+					inside = pattern.inside,
+					lookbehind = !!pattern.lookbehind,
+					lookbehindLength = 0;
 
-			for (var i=0; i<strarr.length; i++) { // Don’t cache length as it changes during the loop
+				pattern = pattern.pattern || pattern;
 
-				var str = strarr[i];
+				for (var i=0; i<strarr.length; i++) { // Don’t cache length as it changes during the loop
 
-				if (strarr.length > text.length) {
-					// Something went terribly wrong, ABORT, ABORT!
-					break tokenloop;
-				}
+					var str = strarr[i];
 
-				if (str instanceof Token) {
-					continue;
-				}
+					if (strarr.length > text.length) {
+						// Something went terribly wrong, ABORT, ABORT!
+						break tokenloop;
+					}
 
-				pattern.lastIndex = 0;
+					if (str instanceof Token) {
+						continue;
+					}
 
-				var match = pattern.exec(str);
+					pattern.lastIndex = 0;
 
-				if (match) {
-					if(lookbehind) {
-						lookbehindLength = match[1].length;
-					}
+					var match = pattern.exec(str);
+
+					if (match) {
+						if(lookbehind) {
+							lookbehindLength = match[1].length;
+						}
 
-					var from = match.index - 1 + lookbehindLength,
-					    match = match[0].slice(lookbehindLength),
-					    len = match.length,
-					    to = from + len,
-						before = str.slice(0, from + 1),
-						after = str.slice(to + 1);
+						var from = match.index - 1 + lookbehindLength,
+							match = match[0].slice(lookbehindLength),
+							len = match.length,
+							to = from + len,
+							before = str.slice(0, from + 1),
+							after = str.slice(to + 1);
 
-					var args = [i, 1];
+						var args = [i, 1];
 
-					if (before) {
-						args.push(before);
-					}
+						if (before) {
+							args.push(before);
+						}
 
-					var wrapped = new Token(token.split('#')[0], inside? _.tokenize(match, inside) : match);
+						var wrapped = new Token(token, inside? _.tokenize(match, inside) : match);
 
-					args.push(wrapped);
+						args.push(wrapped);
 
-					if (after) {
-						args.push(after);
-					}
+						if (after) {
+							args.push(after);
+						}
 
-					Array.prototype.splice.apply(strarr, args);
+						Array.prototype.splice.apply(strarr, args);
+					}
 				}
 			}
 		}
diff --git a/extending.html b/extending.html
index 5d29ef8..273e9a3 100644
--- a/extending.html
+++ b/extending.html
@@ -54,18 +54,12 @@
 	
  	<p>Unless explicitly allowed through the <code>inside</code> property, each token cannot contain other tokens, so their order is significant. Although per the ECMAScript specification, objects are not required to have a specific ordering of their properties, in practice they do in every modern browser.</p>
  	
-	<p>In most languages there are multiple different ways of declaring the same constructs (e.g. comments, strings, ...) and sometimes it is difficult or unpractical to match all of them with one single regular expression. But because the token names are stored as key values in an object literal, it is not possible to define more than one regular expression with the same token name. To ameliorate this problem a special syntax can be used:</p>
+	<p>In most languages there are multiple different ways of declaring the same constructs (e.g. comments, strings, ...), and sometimes it is difficult or impractical to match all of them with one single regular expression. To add multiple regular expressions for one token name, an array can be used:</p>
 
 	<pre><code class="language-javascript">...
-'tokenname': /regex0/,
-'tokenname#first-description': /regex1/,
-'tokenname#second-description': {
-	pattern: /regex2/
-}
+'tokenname': [ /regex0/, /regex1/, { pattern: /regex2/ } ]
 ...</code></pre>
 
-	<p>Only the part before the <code>#</code> is used as the actual token name. The description after it is only used to distinguish the names in the language object and for documentation and readability.</p>
-
  	<section>
 	 	<h1><code>Prism.languages.insertBefore(inside, before, insert<span class="optional" title="Default value: Prism.languages">, root</span>)</code></h1>