Commit 2c47354774b104c10e13f02fa3ad2319adb46b04

Lea Verou 2012-07-15T16:35:06

Fixed regexes further, documented lookbehind feature

diff --git a/components/prism-core.js b/components/prism-core.js
index 5d684a9..21134f9 100644
--- a/components/prism-core.js
+++ b/components/prism-core.js
@@ -17,7 +17,7 @@ var _ = self.Prism = {
 		var elements = document.querySelectorAll('pre.prism, pre.prism > code, code.prism');
 
 		for (var i=0, element; element = elements[i++];) {
-			if(/pre/i.test(element.nodeName) && element.children.length > 0) {
+			if (/pre/i.test(element.nodeName) && element.children.length > 0) {
 				continue;
 			}
 			
@@ -98,7 +98,7 @@ var _ = self.Prism = {
 			
 			var pattern = tokens[token], 
 				inside = pattern.inside,
-				lookbehind = pattern.lookbehind || 0;
+				lookbehind = !!pattern.lookbehind || 0;
 			
 			pattern = pattern.pattern || pattern;
 			
@@ -120,7 +120,11 @@ var _ = self.Prism = {
 				var match = pattern.exec(str);
 				
 				if (match) {
-					var from = match.index - 1 + lookbehind;
+					if(lookbehind) {
+						lookbehind = match[1].length;
+					}
+
+					var from = match.index - 1 + lookbehind,
 					    match = match[0].slice(lookbehind),
 					    len = match.length,
 					    to = from + len,
diff --git a/components/prism-javascript.js b/components/prism-javascript.js
index 1d2589a..f84b281 100644
--- a/components/prism-javascript.js
+++ b/components/prism-javascript.js
@@ -1,16 +1,16 @@
 Prism.languages.javascript = {
 	'comment': /\/\*[\w\W]*?\*\//g,
 	'regex': {
-		pattern: /[^/]\/(\\?.)+?\/[gim]{0,3}/g,
-		lookbehind: 1
+		pattern: /(^|[^/])\/(?!\/)(\[.+?]|\\.|[^/\r\n])+\/[gim]{0,3}/g,
+		lookbehind: true
 	},
 	'line-comment': /\/\/.*?(\r?\n|$)/g,
 	'string': /("|')(\\?.)*?\1/g,
 	'keyword': /\b(var|let|if|else|while|do|for|return|in|instanceof|function|new|with|typeof|try|catch|finally|null|break|continue)\b/g,
 	'boolean': /\b(true|false)\b/g,
 	'number': /\b-?(0x)?\d*\.?\d+\b/g,
-	'operator': /[-+]{1,2}|!|=?<|=?>|={1,2}|(&){1,2}|\|?\||\?|:|\*|\//g,
+	'operator': /[-+]{1,2}|!|=?<|=?>|={1,2}|(&){1,2}|\|?\||\?|\*|\//g,
 	'ignore': /&(lt|gt|amp);/gi,
-	'punctuation': /[{}[\];(),.]/g,
+	'punctuation': /[{}[\];(),.:]/g,
 	'tab': /\t/g
 };
\ No newline at end of file
diff --git a/examples.html b/examples.html
index 3a638b5..7ade7a9 100644
--- a/examples.html
+++ b/examples.html
@@ -112,6 +112,10 @@ ol {}</code></pre>
 	}
 }</code></pre>
 
+	<h2>Regexes</h2>
+	<pre class="prism"><code class="language-javascript">/([^/])\/(\\?.|\[.+?])+?\/[gim]{0,3}/g</code></pre>
+	<pre class="prism"><code class="language-javascript">/\/\*[\w\W]*?\*\//g</code></pre>
+
 	<h2>Single line comments &amp; regexes</h2>
 	<pre class="prism"><code class="language-javascript">// http://lea.verou.me
 var comment = /\/\*[\w\W]*?\*\//g;</code></pre>
@@ -119,6 +123,9 @@ var comment = /\/\*[\w\W]*?\*\//g;</code></pre>
 	<h2>Link in comment</h2>
 	<pre class="prism"><code class="language-javascript">// http://lea.verou.me
 /* http://lea.verou.me */</code></pre>
+
+	<h2>Strings with slashes</h2>
+	<pre class="prism"><code class="language-javascript">env.content + '&lt;/' + env.tag + '>'</code></pre>
 </section>
 
 <footer><div class="wrapper">
diff --git a/index.html b/index.html
index 4487176..bfcf7ea 100644
--- a/index.html
+++ b/index.html
@@ -111,13 +111,30 @@
 <section id="language-definitions">
 	<h1>Language definitions</h1>
 	
-	<p>Every language is defined as a set of tokens, which are expressed as regular expressions. For example, this is the language definition for JavaScript:</p>
-	<pre data-src="components/prism-javascript.js"></pre>
+	<p>Every language is defined as a set of tokens, which are expressed as regular expressions. For example, this is the language definition for CSS:</p>
+	<pre data-src="components/prism-css.js"></pre>
+	
+	<p>A regular expression literal is the simplest way to express a token. An alternative, which allows more options, is to use an object literal. With that notation, the regular expression describing the token is given by the <code>pattern</code> property:</p>
+	<pre class="prism"><code class="language-javascript">...
+'tokenname': {
+	pattern: /regex/
+}
+...</code></pre>
+	<p>So far the functionality is exactly the same between the short and extended notations. However, the extended notation allows for additional options:</p>
+	
+	<dl>
+		<dt>inside</dt>
+		<dd>This property accepts another object literal, with tokens that are allowed to be nested in this token. 
+		This makes it easier to define certain languages. However, keep in mind that nested tokens are slower and, if coded poorly, can even result in infinite recursion.
+		For an example of nested tokens, check out the Markup language definition:
+		<pre data-src="components/prism-markup.js"></pre></dd>
+		
+		<dt>lookbehind</dt>
+		<dd>This option mitigates JavaScript’s lack of lookbehind. When set to <code class="prism language-javascript">true</code>, the first capturing group in the regex <code>pattern</code> is discarded when matching this token, so it effectively behaves as if it were a lookbehind.</dd>
+	</dl>
 	
- 	<p>Each token cannot contain other tokens, so their order is significant. Although per the ECMAScript specification, objects are not required to have a specific ordering of their properties, in practice they do in every modern browser.</p>
- 	<p>In some cases, it’s easier to define a language when certain tokens can only be nested inside other tokens. This is allowed by using an object as the token value, instead of a regular expression. This object has a <code>pattern</code> property,
- 	which contains the regular expression that describes the entire token and an <code>inside</code> property that contains the tokens that can be nested inside that token. For example, the tokens for HTML are defined in that way:</p>
- 	<pre data-src="components/prism-markup.js"></pre>
+ 	<p>Unless explicitly allowed through the <code>inside</code> property, a token cannot contain other tokens, so the order of tokens is significant. Although per the ECMAScript specification, objects are not required to have a specific ordering of their properties, in practice they do in every modern browser.</p>
+ 	
  	
  	<p>Nested definitions can help you get around JavaScript’s lack of lookbehind assertions and usually result in simpler regular expressions. However, keep in mind that they’re slower and if coded poorly, can even result in infinite recursion.</p>
 </section>
diff --git a/prism.css b/prism.css
index 5a138b2..2cc8c46 100644
--- a/prism.css
+++ b/prism.css
@@ -70,6 +70,7 @@ code.prism {
 
 .operator {
 	color: #a67f59;
+	background: hsla(0,0%,100%,.5);
 }
 
 .atrule,
diff --git a/prism.js b/prism.js
index 39227a4..db76c93 100644
--- a/prism.js
+++ b/prism.js
@@ -17,7 +17,7 @@ var _ = self.Prism = {
 		var elements = document.querySelectorAll('pre.prism, pre.prism > code, code.prism');
 
 		for (var i=0, element; element = elements[i++];) {
-			if(/pre/i.test(element.nodeName) && element.children.length > 0) {
+			if (/pre/i.test(element.nodeName) && element.children.length > 0) {
 				continue;
 			}
 			
@@ -98,7 +98,7 @@ var _ = self.Prism = {
 			
 			var pattern = tokens[token], 
 				inside = pattern.inside,
-				lookbehind = pattern.lookbehind || 0;
+				lookbehind = !!pattern.lookbehind || 0;
 			
 			pattern = pattern.pattern || pattern;
 			
@@ -120,7 +120,11 @@ var _ = self.Prism = {
 				var match = pattern.exec(str);
 				
 				if (match) {
-					var from = match.index - 1 + lookbehind;
+					if(lookbehind) {
+						lookbehind = match[1].length;
+					}
+
+					var from = match.index - 1 + lookbehind,
 					    match = match[0].slice(lookbehind),
 					    len = match.length,
 					    to = from + len,
@@ -260,17 +264,17 @@ Prism.languages.css = {
 Prism.languages.javascript = {
 	'comment': /\/\*[\w\W]*?\*\//g,
 	'regex': {
-		pattern: /[^/]\/(\\?.)+?\/[gim]{0,3}/g,
-		lookbehind: 1
+		pattern: /(^|[^/])\/(?!\/)(\[.+?]|\\.|[^/\r\n])+\/[gim]{0,3}/g,
+		lookbehind: true
 	},
 	'line-comment': /\/\/.*?(\r?\n|$)/g,
 	'string': /("|')(\\?.)*?\1/g,
 	'keyword': /\b(var|let|if|else|while|do|for|return|in|instanceof|function|new|with|typeof|try|catch|finally|null|break|continue)\b/g,
 	'boolean': /\b(true|false)\b/g,
 	'number': /\b-?(0x)?\d*\.?\d+\b/g,
-	'operator': /[-+]{1,2}|!|=?&lt;|=?&gt;|={1,2}|(&amp;){1,2}|\|?\||\?|:|\*|\//g,
+	'operator': /[-+]{1,2}|!|=?&lt;|=?&gt;|={1,2}|(&amp;){1,2}|\|?\||\?|\*|\//g,
 	'ignore': /&(lt|gt|amp);/gi,
-	'punctuation': /[{}[\];(),.]/g,
+	'punctuation': /[{}[\];(),.:]/g,
 	'tab': /\t/g
 };