Commit 7bcc5da08fbd83ce8e35688eb6b10023742db5a7

Michael Schmidt 2021-12-18T12:53:19

OCaml: Improved tokenization (#3269)

diff --git a/components/prism-ocaml.js b/components/prism-ocaml.js
index d9b06a3..2fac3b0 100644
--- a/components/prism-ocaml.js
+++ b/components/prism-ocaml.js
@@ -1,23 +1,39 @@
+// https://ocaml.org/manual/lex.html
+
 Prism.languages.ocaml = {
-	'comment': /\(\*[\s\S]*?\*\)/,
+	'comment': {
+		pattern: /\(\*[\s\S]*?\*\)/,
+		greedy: true
+	},
+	'char': {
+		pattern: /'(?:[^\\\r\n']|\\(?:.|[ox]?[0-9a-f]{1,3}))'/i,
+		greedy: true
+	},
 	'string': [
 		{
-			pattern: /"(?:\\.|[^\\\r\n"])*"/,
+			pattern: /"(?:\\(?:[\s\S]|\r\n)|[^\\\r\n"])*"/,
 			greedy: true
 		},
 		{
-			pattern: /(['`])(?:\\(?:\d+|x[\da-f]+|.)|(?!\1)[^\\\r\n])\1/i,
+			pattern: /\{([a-z_]*)\|[\s\S]*?\|\1\}/,
 			greedy: true
 		}
 	],
-	'number': /\b(?:0x[\da-f][\da-f_]+|(?:0[bo])?\d[\d_]*(?:\.[\d_]*)?(?:e[+-]?[\d_]+)?)/i,
+	'number': [
+		// binary and octal
+		/\b(?:0b[01][01_]*|0o[0-7][0-7_]*)\b/i,
+		// hexadecimal
+		/\b0x[a-f0-9][a-f0-9_]*(?:\.[a-f0-9_]*)?(?:p[+-]?\d[\d_]*)?(?!\w)/i,
+		// decimal
+		/\b\d[\d_]*(?:\.[\d_]*)?(?:e[+-]?\d[\d_]*)?(?!\w)/i,
+	],
 	'directive': {
 		pattern: /\B#\w+/,
-		alias: 'important'
+		alias: 'property'
 	},
 	'label': {
 		pattern: /\B~\w+/,
-		alias: 'function'
+		alias: 'property'
 	},
 	'type-variable': {
 		pattern: /\B'\w+/,
@@ -25,17 +41,18 @@ Prism.languages.ocaml = {
 	},
 	'variant': {
 		pattern: /`\w+/,
-		alias: 'variable'
-	},
-	'module': {
-		pattern: /\b[A-Z]\w+/,
-		alias: 'variable'
+		alias: 'symbol'
 	},
 	// For the list of keywords and operators,
 	// see: http://caml.inria.fr/pub/docs/manual-ocaml/lex.html#sec84
 	'keyword': /\b(?:as|assert|begin|class|constraint|do|done|downto|else|end|exception|external|for|fun|function|functor|if|in|include|inherit|initializer|lazy|let|match|method|module|mutable|new|nonrec|object|of|open|private|rec|sig|struct|then|to|try|type|val|value|virtual|when|where|while|with)\b/,
 	'boolean': /\b(?:false|true)\b/,
+
+	'operator-like-punctuation': {
+		pattern: /\[[<>|]|[>|]\]|\{<|>\}/,
+		alias: 'punctuation'
+	},
 	// Custom operators are allowed
-	'operator': /:=|[=<>@^|&+\-*\/$%!?~][!$%&*+\-.\/:<=>?@^|~]*|\b(?:and|asr|land|lor|lsl|lsr|lxor|mod|or)\b/,
-	'punctuation': /[(){}\[\].,:;]|\b_\b/
+	'operator': /\.[.~]|:[=>]|[=<>@^|&+\-*\/$%!?~][!$%&*+\-.\/:<=>?@^|~]*|\b(?:and|asr|land|lor|lsl|lsr|lxor|mod|or)\b/,
+	'punctuation': /;;|::|[(){}\[\].,:;#]|\b_\b/
 };
diff --git a/components/prism-ocaml.min.js b/components/prism-ocaml.min.js
index 1b7d7bc..c7b21ad 100644
--- a/components/prism-ocaml.min.js
+++ b/components/prism-ocaml.min.js
@@ -1 +1 @@
-Prism.languages.ocaml={comment:/\(\*[\s\S]*?\*\)/,string:[{pattern:/"(?:\\.|[^\\\r\n"])*"/,greedy:!0},{pattern:/(['`])(?:\\(?:\d+|x[\da-f]+|.)|(?!\1)[^\\\r\n])\1/i,greedy:!0}],number:/\b(?:0x[\da-f][\da-f_]+|(?:0[bo])?\d[\d_]*(?:\.[\d_]*)?(?:e[+-]?[\d_]+)?)/i,directive:{pattern:/\B#\w+/,alias:"important"},label:{pattern:/\B~\w+/,alias:"function"},"type-variable":{pattern:/\B'\w+/,alias:"function"},variant:{pattern:/`\w+/,alias:"variable"},module:{pattern:/\b[A-Z]\w+/,alias:"variable"},keyword:/\b(?:as|assert|begin|class|constraint|do|done|downto|else|end|exception|external|for|fun|function|functor|if|in|include|inherit|initializer|lazy|let|match|method|module|mutable|new|nonrec|object|of|open|private|rec|sig|struct|then|to|try|type|val|value|virtual|when|where|while|with)\b/,boolean:/\b(?:false|true)\b/,operator:/:=|[=<>@^|&+\-*\/$%!?~][!$%&*+\-.\/:<=>?@^|~]*|\b(?:and|asr|land|lor|lsl|lsr|lxor|mod|or)\b/,punctuation:/[(){}\[\].,:;]|\b_\b/};
\ No newline at end of file
+Prism.languages.ocaml={comment:{pattern:/\(\*[\s\S]*?\*\)/,greedy:!0},char:{pattern:/'(?:[^\\\r\n']|\\(?:.|[ox]?[0-9a-f]{1,3}))'/i,greedy:!0},string:[{pattern:/"(?:\\(?:[\s\S]|\r\n)|[^\\\r\n"])*"/,greedy:!0},{pattern:/\{([a-z_]*)\|[\s\S]*?\|\1\}/,greedy:!0}],number:[/\b(?:0b[01][01_]*|0o[0-7][0-7_]*)\b/i,/\b0x[a-f0-9][a-f0-9_]*(?:\.[a-f0-9_]*)?(?:p[+-]?\d[\d_]*)?(?!\w)/i,/\b\d[\d_]*(?:\.[\d_]*)?(?:e[+-]?\d[\d_]*)?(?!\w)/i],directive:{pattern:/\B#\w+/,alias:"property"},label:{pattern:/\B~\w+/,alias:"property"},"type-variable":{pattern:/\B'\w+/,alias:"function"},variant:{pattern:/`\w+/,alias:"symbol"},keyword:/\b(?:as|assert|begin|class|constraint|do|done|downto|else|end|exception|external|for|fun|function|functor|if|in|include|inherit|initializer|lazy|let|match|method|module|mutable|new|nonrec|object|of|open|private|rec|sig|struct|then|to|try|type|val|value|virtual|when|where|while|with)\b/,boolean:/\b(?:false|true)\b/,"operator-like-punctuation":{pattern:/\[[<>|]|[>|]\]|\{<|>\}/,alias:"punctuation"},operator:/\.[.~]|:[=>]|[=<>@^|&+\-*\/$%!?~][!$%&*+\-.\/:<=>?@^|~]*|\b(?:and|asr|land|lor|lsl|lsr|lxor|mod|or)\b/,punctuation:/;;|::|[(){}\[\].,:;#]|\b_\b/};
\ No newline at end of file
diff --git a/tests/languages/ocaml/char_feature.test b/tests/languages/ocaml/char_feature.test
new file mode 100644
index 0000000..57dbbd9
--- /dev/null
+++ b/tests/languages/ocaml/char_feature.test
@@ -0,0 +1,15 @@
+'a'
+'\n'
+'\''
+'\xA9'
+'\169'
+
+----------------------------------------------------
+
+[
+	["char", "'a'"],
+	["char", "'\\n'"],
+	["char", "'\\''"],
+	["char", "'\\xA9'"],
+	["char", "'\\169'"]
+]
diff --git a/tests/languages/ocaml/module_feature.test b/tests/languages/ocaml/module_feature.test
deleted file mode 100644
index 006d259..0000000
--- a/tests/languages/ocaml/module_feature.test
+++ /dev/null
@@ -1,15 +0,0 @@
-Foo
-Bar42
-Baz_42
-
-----------------------------------------------------
-
-[
-	["module", "Foo"],
-	["module", "Bar42"],
-	["module", "Baz_42"]
-]
-
-----------------------------------------------------
-
-Checks for modules.
\ No newline at end of file
diff --git a/tests/languages/ocaml/number_feature.test b/tests/languages/ocaml/number_feature.test
index 6ade211..5975858 100644
--- a/tests/languages/ocaml/number_feature.test
+++ b/tests/languages/ocaml/number_feature.test
@@ -5,9 +5,13 @@
 0b1010_1111
 42_000
 3.14_15_9
+3.141_592_653_589_793_12
+1e-5
 3.2e8
 6.1E-7
+2.22044604925031308e-16
 0.4e+12_415
+0x1p-52
 
 ----------------------------------------------------
 
@@ -19,11 +23,15 @@
 	["number", "0b1010_1111"],
 	["number", "42_000"],
 	["number", "3.14_15_9"],
+	["number", "3.141_592_653_589_793_12"],
+	["number", "1e-5"],
 	["number", "3.2e8"],
 	["number", "6.1E-7"],
-	["number", "0.4e+12_415"]
+	["number", "2.22044604925031308e-16"],
+	["number", "0.4e+12_415"],
+	["number", "0x1p-52"]
 ]
 
 ----------------------------------------------------
 
-Checks for numbers.
\ No newline at end of file
+Checks for numbers.
diff --git a/tests/languages/ocaml/operator_feature.test b/tests/languages/ocaml/operator_feature.test
index 50047fe..47ea03e 100644
--- a/tests/languages/ocaml/operator_feature.test
+++ b/tests/languages/ocaml/operator_feature.test
@@ -2,11 +2,12 @@ and asr land
 lor lsl lsr
 lxor mod or
 
-:=
+:= :>
 = < > @
-^ | & ~
+^ | & ~ .~
 + - * /
 $ % ! ?
+..
 
 ~=~
 
@@ -18,14 +19,34 @@ $ % ! ?
 	["operator", "lxor"], ["operator", "mod"], ["operator", "or"],
 
 	["operator", ":="],
-	["operator", "="], ["operator", "<"], ["operator", ">"], ["operator", "@"],
-	["operator", "^"], ["operator", "|"], ["operator", "&"], ["operator", "~"],
-	["operator", "+"], ["operator", "-"], ["operator", "*"], ["operator", "/"],
-	["operator", "$"], ["operator", "%"], ["operator", "!"], ["operator", "?"],
+	["operator", ":>"],
+
+	["operator", "="],
+	["operator", "<"],
+	["operator", ">"],
+	["operator", "@"],
+
+	["operator", "^"],
+	["operator", "|"],
+	["operator", "&"],
+	["operator", "~"],
+	["operator", ".~"],
+
+	["operator", "+"],
+	["operator", "-"],
+	["operator", "*"],
+	["operator", "/"],
+
+	["operator", "$"],
+	["operator", "%"],
+	["operator", "!"],
+	["operator", "?"],
+
+	["operator", ".."],
 
 	["operator", "~=~"]
 ]
 
 ----------------------------------------------------
 
-Checks for operators.
\ No newline at end of file
+Checks for operators.
diff --git a/tests/languages/ocaml/punctuation_feature.test b/tests/languages/ocaml/punctuation_feature.test
index 48b2a53..8c607e0 100644
--- a/tests/languages/ocaml/punctuation_feature.test
+++ b/tests/languages/ocaml/punctuation_feature.test
@@ -1,6 +1,12 @@
 ( ) { } [ ]
 . , : ;
 _
+:: ;;
+
+[< [> [| {<
+>] >} |]
+
+#
 
 ----------------------------------------------------
 
@@ -17,5 +23,19 @@ _
 	["punctuation", ":"],
 	["punctuation", ";"],
 
-	["punctuation", "_"]
+	["punctuation", "_"],
+
+	["punctuation", "::"],
+	["punctuation", ";;"],
+
+	["operator-like-punctuation", "[<"],
+	["operator-like-punctuation", "[>"],
+	["operator-like-punctuation", "[|"],
+	["operator-like-punctuation", "{<"],
+
+	["operator-like-punctuation", ">]"],
+	["operator-like-punctuation", ">}"],
+	["operator-like-punctuation", "|]"],
+
+	["punctuation", "#"]
 ]
diff --git a/tests/languages/ocaml/string_feature.test b/tests/languages/ocaml/string_feature.test
index b7eabd4..bfaef71 100644
--- a/tests/languages/ocaml/string_feature.test
+++ b/tests/languages/ocaml/string_feature.test
@@ -1,25 +1,31 @@
 ""
 "Fo\"obar"
-'\''
-'\123'
-'\xf4'
-`\``
-`\123`
-`\xf4`
+"Call me Ishmael. Some years ago — never mind how long \
+precisely — having little or no money in my purse, and \
+nothing particular to interest me on shore, I thought I\
+\ would sail about a little and see the watery part of t\
+he world."
+
+{|This is a quoted string, here, neither \ nor " are special characters|}
+{|"Hello, World!"|}
+{|"\\"|}
+{delimiter|the end of this|}quoted string is here|delimiter}
+{ext|hello {|world|}|ext}
 
 ----------------------------------------------------
 
 [
 	["string", "\"\""],
 	["string", "\"Fo\\\"obar\""],
-	["string", "'\\''"],
-	["string", "'\\123'"],
-	["string", "'\\xf4'"],
-	["string", "`\\``"],
-	["string", "`\\123`"],
-	["string", "`\\xf4`"]
+	["string", "\"Call me Ishmael. Some years ago — never mind how long \\\r\nprecisely — having little or no money in my purse, and \\\r\nnothing particular to interest me on shore, I thought I\\\r\n\\ would sail about a little and see the watery part of t\\\r\nhe world.\""],
+
+	["string", "{|This is a quoted string, here, neither \\ nor \" are special characters|}"],
+	["string", "{|\"Hello, World!\"|}"],
+	["string", "{|\"\\\\\"|}"],
+	["string", "{delimiter|the end of this|}quoted string is here|delimiter}"],
+	["string", "{ext|hello {|world|}|ext}"]
 ]
 
 ----------------------------------------------------
 
-Checks for strings.
\ No newline at end of file
+Checks for strings.