[docmaker] Recognise URLs.

* src/tools/docmaker/tohtml.py (re_url): New regular expression.
(make_html_para): Use it.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
diff --git a/ChangeLog b/ChangeLog
index 5cda57a..7425941 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2013-06-25 Werner Lemberg <wl@gnu.org>
+
+ [docmaker] Recognise URLs.
+
+ * src/tools/docmaker/tohtml.py (re_url): New regular expression.
+ (make_html_para): Use it.
+
2013-06-19 Werner Lemberg <wl@gnu.org>
* Version 2.5.0.1 released.
diff --git a/src/tools/docmaker/tohtml.py b/src/tools/docmaker/tohtml.py
index 1cbda75..2ff44d6 100644
--- a/src/tools/docmaker/tohtml.py
+++ b/src/tools/docmaker/tohtml.py
@@ -1,11 +1,46 @@
-# ToHTML (c) 2002, 2003, 2005, 2006, 2007, 2008
+# ToHTML (c) 2002, 2003, 2005-2008, 2013
# David Turner <david@freetype.org>
from sources import *
from content import *
from formatter import *
-import time
+import time, re
+
+
+# this regular expression code to identify a URL has been taken from
+#
+# http://mail.python.org/pipermail/tutor/2002-September/017228.html
+#
+# (with slight modifications)
+
+urls = r'(?:https?|telnet|gopher|file|wais|ftp)'
+ltrs = r'\w'
+gunk = r'/#~:.?+=&%@!\-'
+punc = r'.:?\-'
+any = "%(ltrs)s%(gunk)s%(punc)s" % { 'ltrs' : ltrs,
+ 'gunk' : gunk,
+ 'punc' : punc }
+url = r"""
+ (
+ \b # start at word boundary
+ %(urls)s : # need resource and a colon
+ [%(any)s] +? # followed by one or more of any valid
+ # character, but be conservative and
+ # take only what you need to...
+ (?= # [look-ahead non-consumptive assertion]
+ [%(punc)s]* # either 0 or more punctuation
+ (?: # [non-grouping parentheses]
+ [^%(any)s] | $ # followed by a non-url char
+ # or end of the string
+ )
+ )
+ )
+ """ % {'urls' : urls,
+ 'any' : any,
+ 'punc' : punc }
+
+re_url = re.compile( url, re.VERBOSE | re.MULTILINE )
# The following defines the HTML header used by all generated pages.
@@ -291,6 +326,8 @@ class HtmlFormatter( Formatter ):
line = self.make_html_word( words[0] )
for word in words[1:]:
line = line + " " + self.make_html_word( word )
+ # handle hyperlinks
+ line = re_url.sub( r'<a href="\1">\1</a>', line )
# convert `...' quotations into real left and right single quotes
line = re.sub( r"(^|\W)`(.*?)'(\W|$)", \
r'\1‘\2’\3', \