Allow multiple regexes per token name by using arrays

In most languages there are multiple different ways of declaring the same constructs (e.g. comments, strings, ...), and sometimes it is difficult or impractical to match all of them with one single regular expression. This patch adds the possibility to use an array of pattern objects for a single token name.

For example, there is a minor bug in the current definition of the `clike` language that could be solved with this patch: the character immediately in front of a single-line comment is highlighted as part of the comment as well.

    something// something

This is because the definitions for single-line and multi-line comments both have to be matched with a single regex, and the `lookbehind` parameter can only be applied to the first capture group. With this patch one could split the two definitions up and use `lookbehind` for both, thereby eliminating the bug.

    'comment': [
        { pattern: /(^|[^\\])\/\*[\w\W]*?\*\//g, lookbehind: true },
        { pattern: /(^|[^\\:])\/\/.*?(\r?\n|$)/g, lookbehind: true }
    ],
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
diff --git a/components/prism-core.js b/components/prism-core.js
index 55658ab..cbaa817 100644
--- a/components/prism-core.js
+++ b/components/prism-core.js
@@ -214,57 +214,62 @@ var _ = self.Prism = {
continue;
}
- var pattern = grammar[token],
- inside = pattern.inside,
- lookbehind = !!pattern.lookbehind,
- lookbehindLength = 0;
+ var patterns = grammar[token];
+ patterns = (_.util.type(patterns) === "Array") ? patterns : [patterns];
- pattern = pattern.pattern || pattern;
+ for (var j = 0; j < patterns.length; ++j) {
+ var pattern = patterns[j],
+ inside = pattern.inside,
+ lookbehind = !!pattern.lookbehind,
+ lookbehindLength = 0;
- for (var i=0; i<strarr.length; i++) { // Don’t cache length as it changes during the loop
+ pattern = pattern.pattern || pattern;
- var str = strarr[i];
+ for (var i=0; i<strarr.length; i++) { // Don’t cache length as it changes during the loop
- if (strarr.length > text.length) {
- // Something went terribly wrong, ABORT, ABORT!
- break tokenloop;
- }
+ var str = strarr[i];
- if (str instanceof Token) {
- continue;
- }
+ if (strarr.length > text.length) {
+ // Something went terribly wrong, ABORT, ABORT!
+ break tokenloop;
+ }
- pattern.lastIndex = 0;
+ if (str instanceof Token) {
+ continue;
+ }
- var match = pattern.exec(str);
+ pattern.lastIndex = 0;
- if (match) {
- if(lookbehind) {
- lookbehindLength = match[1].length;
- }
+ var match = pattern.exec(str);
+
+ if (match) {
+ if(lookbehind) {
+ lookbehindLength = match[1].length;
+ }
- var from = match.index - 1 + lookbehindLength,
- match = match[0].slice(lookbehindLength),
- len = match.length,
- to = from + len,
- before = str.slice(0, from + 1),
- after = str.slice(to + 1);
+ var from = match.index - 1 + lookbehindLength,
+ match = match[0].slice(lookbehindLength),
+ len = match.length,
+ to = from + len,
+ before = str.slice(0, from + 1),
+ after = str.slice(to + 1);
- var args = [i, 1];
+ var args = [i, 1];
- if (before) {
- args.push(before);
- }
+ if (before) {
+ args.push(before);
+ }
- var wrapped = new Token(token.split('#')[0], inside? _.tokenize(match, inside) : match);
+ var wrapped = new Token(token, inside? _.tokenize(match, inside) : match);
- args.push(wrapped);
+ args.push(wrapped);
- if (after) {
- args.push(after);
- }
+ if (after) {
+ args.push(after);
+ }
- Array.prototype.splice.apply(strarr, args);
+ Array.prototype.splice.apply(strarr, args);
+ }
}
}
}
diff --git a/extending.html b/extending.html
index 5d29ef8..273e9a3 100644
--- a/extending.html
+++ b/extending.html
@@ -54,18 +54,12 @@
<p>Unless explicitly allowed through the <code>inside</code> property, each token cannot contain other tokens, so their order is significant. Although per the ECMAScript specification, objects are not required to have a specific ordering of their properties, in practice they do in every modern browser.</p>
- <p>In most languages there are multiple different ways of declaring the same constructs (e.g. comments, strings, ...) and sometimes it is difficult or unpractical to match all of them with one single regular expression. But because the token names are stored as key values in an object literal, it is not possible to define more than one regular expression with the same token name. To ameliorate this problem a special syntax can be used:</p>
+ <p>In most languages there are multiple different ways of declaring the same constructs (e.g. comments, strings, ...) and sometimes it is difficult or impractical to match all of them with one single regular expression. To add multiple regular expressions for one token name, an array can be used:</p>
<pre><code class="language-javascript">...
-'tokenname': /regex0/,
-'tokenname#first-description': /regex1/,
-'tokenname#second-description': {
- pattern: /regex2/
-}
+'tokenname': [ /regex0/, /regex1/, { pattern: /regex2/ } ]
...</code></pre>
- <p>Only the part before the <code>#</code> is used as the actual token name. The description after it is only used to distinguish the names in the language object and for documentation and readability.</p>
-
<section>
<h1><code>Prism.languages.insertBefore(inside, before, insert<span class="optional" title="Default value: Prism.languages">, root</span>)</code></h1>