Commit 7bcc5da08fbd83ce8e35688eb6b10023742db5a7

Michael Schmidt 2021-12-18T12:53:19

OCaml: Improved tokenization (#3269)

diff --git a/components/prism-ocaml.js b/components/prism-ocaml.js
--- a/components/prism-ocaml.js
+++ b/components/prism-ocaml.js
@@ -1,23 +1,39 @@
 Prism.languages.ocaml = {
-	'comment': /\(\*[\s\S]*?\*\)/,
+	'comment': {
+		pattern: /\(\*[\s\S]*?\*\)/,
+		greedy: true
+	},
+	'char': {
+		pattern: /'(?:[^\\\r\n']|\\(?:.|[ox]?[0-9a-f]{1,3}))'/i,
+		greedy: true
+	},
 	'string': [
-			pattern: /"(?:\\.|[^\\\r\n"])*"/,
+			pattern: /"(?:\\(?:[\s\S]|\r\n)|[^\\\r\n"])*"/,
 			greedy: true
-			pattern: /(['`])(?:\\(?:\d+|x[\da-f]+|.)|(?!\1)[^\\\r\n])\1/i,
+			pattern: /\{([a-z_]*)\|[\s\S]*?\|\1\}/,
 			greedy: true
-	'number': /\b(?:0x[\da-f][\da-f_]+|(?:0[bo])?\d[\d_]*(?:\.[\d_]*)?(?:e[+-]?[\d_]+)?)/i,
+	'number': [
+		// binary and octal
+		/\b(?:0b[01][01_]*|0o[0-7][0-7_]*)\b/i,
+		// hexadecimal integers and floats (optional fraction and `p` exponent)
+		/\b0x[a-f0-9][a-f0-9_]*(?:\.[a-f0-9_]*)?(?:p[+-]?\d[\d_]*)?(?!\w)/i,
+		// decimal integers and floats (optional fraction and `e` exponent)
+		/\b\d[\d_]*(?:\.[\d_]*)?(?:e[+-]?\d[\d_]*)?(?!\w)/i,
+	],
 	'directive': {
 		pattern: /\B#\w+/,
-		alias: 'important'
+		alias: 'property'
 	'label': {
 		pattern: /\B~\w+/,
-		alias: 'function'
+		alias: 'property'
 	'type-variable': {
 		pattern: /\B'\w+/,
@@ -25,17 +41,18 @@ Prism.languages.ocaml = {
 	'variant': {
 		pattern: /`\w+/,
-		alias: 'variable'
-	},
-	'module': {
-		pattern: /\b[A-Z]\w+/,
-		alias: 'variable'
+		alias: 'symbol'
 	// For the list of keywords and operators, see the "Lexical conventions"
 	// chapter of the OCaml manual: https://ocaml.org/manual/lex.html
 	'keyword': /\b(?:as|assert|begin|class|constraint|do|done|downto|else|end|exception|external|for|fun|function|functor|if|in|include|inherit|initializer|lazy|let|match|method|module|mutable|new|nonrec|object|of|open|private|rec|sig|struct|then|to|try|type|val|value|virtual|when|where|while|with)\b/,
 	'boolean': /\b(?:false|true)\b/,
+	'operator-like-punctuation': {
+		pattern: /\[[<>|]|[>|]\]|\{<|>\}/,
+		alias: 'punctuation'
+	},
 	// Custom operators are allowed
-	'operator': /:=|[=<>@^|&+\-*\/$%!?~][!$%&*+\-.\/:<=>?@^|~]*|\b(?:and|asr|land|lor|lsl|lsr|lxor|mod|or)\b/,
-	'punctuation': /[(){}\[\].,:;]|\b_\b/
+	'operator': /\.[.~]|:[=>]|[=<>@^|&+\-*\/$%!?~][!$%&*+\-.\/:<=>?@^|~]*|\b(?:and|asr|land|lor|lsl|lsr|lxor|mod|or)\b/,
+	'punctuation': /;;|::|[(){}\[\].,:;#]|\b_\b/
diff --git a/components/prism-ocaml.min.js b/components/prism-ocaml.min.js
--- a/components/prism-ocaml.min.js
+++ b/components/prism-ocaml.min.js
@@ -1 +1 @@
diff --git a/tests/languages/ocaml/char_feature.test b/tests/languages/ocaml/char_feature.test
new file mode 100644
--- /dev/null
+++ b/tests/languages/ocaml/char_feature.test
@@ -0,0 +1,15 @@
+	["char", "'a'"],
+	["char", "'\\n'"],
+	["char", "'\\''"],
+	["char", "'\\xA9'"],
+	["char", "'\\169'"]
diff --git a/tests/languages/ocaml/module_feature.test b/tests/languages/ocaml/module_feature.test
deleted file mode 100644
--- a/tests/languages/ocaml/module_feature.test
+++ /dev/null
@@ -1,15 +0,0 @@
-	["module", "Foo"],
-	["module", "Bar42"],
-	["module", "Baz_42"]
-Checks for modules.
diff --git a/tests/languages/ocaml/number_feature.test b/tests/languages/ocaml/number_feature.test
index 6ade211..5975858 100644
--- a/tests/languages/ocaml/number_feature.test
+++ b/tests/languages/ocaml/number_feature.test
@@ -5,9 +5,13 @@
@@ -19,11 +23,15 @@
 	["number", "0b1010_1111"],
 	["number", "42_000"],
 	["number", "3.14_15_9"],
+	["number", "3.141_592_653_589_793_12"],
+	["number", "1e-5"],
 	["number", "3.2e8"],
 	["number", "6.1E-7"],
-	["number", "0.4e+12_415"]
+	["number", "2.22044604925031308e-16"],
+	["number", "0.4e+12_415"],
+	["number", "0x1p-52"]
-Checks for numbers.
+Checks for numbers.
diff --git a/tests/languages/ocaml/operator_feature.test b/tests/languages/ocaml/operator_feature.test
index 50047fe..47ea03e 100644
--- a/tests/languages/ocaml/operator_feature.test
+++ b/tests/languages/ocaml/operator_feature.test
@@ -2,11 +2,12 @@ and asr land
 lor lsl lsr
 lxor mod or
+:= :>
 = < > @
-^ | & ~
+^ | & ~ .~
 + - * /
 $ % ! ?
@@ -18,14 +19,34 @@ $ % ! ?
 	["operator", "lxor"], ["operator", "mod"], ["operator", "or"],
 	["operator", ":="],
-	["operator", "="], ["operator", "<"], ["operator", ">"], ["operator", "@"],
-	["operator", "^"], ["operator", "|"], ["operator", "&"], ["operator", "~"],
-	["operator", "+"], ["operator", "-"], ["operator", "*"], ["operator", "/"],
-	["operator", "$"], ["operator", "%"], ["operator", "!"], ["operator", "?"],
+	["operator", ":>"],
+	["operator", "="],
+	["operator", "<"],
+	["operator", ">"],
+	["operator", "@"],
+	["operator", "^"],
+	["operator", "|"],
+	["operator", "&"],
+	["operator", "~"],
+	["operator", ".~"],
+	["operator", "+"],
+	["operator", "-"],
+	["operator", "*"],
+	["operator", "/"],
+	["operator", "$"],
+	["operator", "%"],
+	["operator", "!"],
+	["operator", "?"],
+	["operator", ".."],
 	["operator", "~=~"]
-Checks for operators.
+Checks for operators.
diff --git a/tests/languages/ocaml/punctuation_feature.test b/tests/languages/ocaml/punctuation_feature.test
index 48b2a53..8c607e0 100644
--- a/tests/languages/ocaml/punctuation_feature.test
+++ b/tests/languages/ocaml/punctuation_feature.test
@@ -1,6 +1,12 @@
 ( ) { } [ ]
 . , : ;
+:: ;;
+[< [> [| {<
+>] >} |]
@@ -17,5 +23,19 @@ _
 	["punctuation", ":"],
 	["punctuation", ";"],
-	["punctuation", "_"]
+	["punctuation", "_"],
+	["punctuation", "::"],
+	["punctuation", ";;"],
+	["operator-like-punctuation", "[<"],
+	["operator-like-punctuation", "[>"],
+	["operator-like-punctuation", "[|"],
+	["operator-like-punctuation", "{<"],
+	["operator-like-punctuation", ">]"],
+	["operator-like-punctuation", ">}"],
+	["operator-like-punctuation", "|]"],
+	["punctuation", "#"]
diff --git a/tests/languages/ocaml/string_feature.test b/tests/languages/ocaml/string_feature.test
index b7eabd4..bfaef71 100644
--- a/tests/languages/ocaml/string_feature.test
+++ b/tests/languages/ocaml/string_feature.test
@@ -1,25 +1,31 @@
+"Call me Ishmael. Some years ago — never mind how long \
+precisely — having little or no money in my purse, and \
+nothing particular to interest me on shore, I thought I\
+\ would sail about a little and see the watery part of t\
+he world."
+{|This is a quoted string, here, neither \ nor " are special characters|}
+{|"Hello, World!"|}
+{delimiter|the end of this|}quoted string is here|delimiter}
+{ext|hello {|world|}|ext}
 	["string", "\"\""],
 	["string", "\"Fo\\\"obar\""],
-	["string", "'\\''"],
-	["string", "'\\123'"],
-	["string", "'\\xf4'"],
-	["string", "`\\``"],
-	["string", "`\\123`"],
-	["string", "`\\xf4`"]
+	["string", "\"Call me Ishmael. Some years ago — never mind how long \\\r\nprecisely — having little or no money in my purse, and \\\r\nnothing particular to interest me on shore, I thought I\\\r\n\\ would sail about a little and see the watery part of t\\\r\nhe world.\""],
+	["string", "{|This is a quoted string, here, neither \\ nor \" are special characters|}"],
+	["string", "{|\"Hello, World!\"|}"],
+	["string", "{|\"\\\\\"|}"],
+	["string", "{delimiter|the end of this|}quoted string is here|delimiter}"],
+	["string", "{ext|hello {|world|}|ext}"]
-Checks for strings.
+Checks for strings.