Hash :
46f05ea4
Author :
Date :
2025-05-09T00:21:47
html: Rework meta charset handling

Don't use encoding from meta tags when serializing. Only use the value in `doc->encoding`, matching the XML serializer. This is the actual encoding used when parsing. Stop modifying the input document by setting meta tags before serializing. Meta tags are now injected during serialization. Add full support for <meta charset=""> which is also used when adding meta tags. Align with HTML5 and implement the "algorithm for extracting a character encoding from a meta element". Only modify the encoding substring in Content-Type meta tags. Only switch encoding once when parsing. Fix htmlSaveFileFormat with a NULL encoding not to declare a misleading UTF-8 charset.

Fixes #909.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
#!/usr/bin/env python3
import sys
import setup_test
import libxml2
# Memory debug specific: this call is made before any document is created;
# the same call is used at the end of the script to report leaked bytes.
libxml2.debugMemory(1)

#
# Testing XML document serialization
#
doc = libxml2.parseDoc("""<root><foo>hello</foo></root>""")

# No arguments: XML declaration without an encoding, markup on one line.
output = doc.serialize()
if output != """<?xml version="1.0"?>
<root><foo>hello</foo></root>
""":
    print("error serializing XML document 1")
    sys.exit(1)

# Explicit encoding: the encoding name is expected in the XML declaration.
output = doc.serialize("iso-8859-1")
if output != """<?xml version="1.0" encoding="iso-8859-1"?>
<root><foo>hello</foo></root>
""":
    print("error serializing XML document 2")
    sys.exit(1)

# format=1: libxml2 puts each child on its own line and indents nested
# elements by two spaces, so the expected text must carry that indentation.
output = doc.serialize(format=1)
if output != """<?xml version="1.0"?>
<root>
  <foo>hello</foo>
</root>
""":
    print("error serializing XML document 3")
    sys.exit(1)

# Encoding and formatting combined (both passed positionally).
output = doc.serialize("iso-8859-1", 1)
if output != """<?xml version="1.0" encoding="iso-8859-1"?>
<root>
  <foo>hello</foo>
</root>
""":
    print("error serializing XML document 4")
    sys.exit(1)
#
# Test serializing a subnode
#
root = doc.getRootElement()

# Each entry is (positional args, keyword args, expected markup, failure
# message).  The expected strings show that a node serialized on its own has
# no XML declaration and no trailing newline, and that the encoding argument
# leaves the markup unchanged.  With formatting enabled libxml2 indents the
# nested element by two spaces, which the expected text must include.
root_cases = (
    (
        (),
        {},
        """<root><foo>hello</foo></root>""",
        "error serializing XML root 1",
    ),
    (
        ("iso-8859-1",),
        {},
        """<root><foo>hello</foo></root>""",
        "error serializing XML root 2",
    ),
    (
        (),
        {"format": 1},
        """<root>
  <foo>hello</foo>
</root>""",
        "error serializing XML root 3",
    ),
    (
        ("iso-8859-1", 1),
        {},
        """<root>
  <foo>hello</foo>
</root>""",
        "error serializing XML root 4",
    ),
)

# Run the cases in order and stop at the first mismatch.
for args, kwargs, expected, message in root_cases:
    if root.serialize(*args, **kwargs) != expected:
        print(message)
        sys.exit(1)

# Release the XML document now that all checks on it have passed.
doc.freeDoc()
#
# Testing HTML document serialization
#
# The input leaves <head> and <p> unclosed; the expected strings below show
# the closing tags being present in the serialized output.
doc = libxml2.htmlParseDoc(
    """<html><head><title>Hello</title><body><p>hello</body></html>""",
    None,
)

# No encoding requested: DOCTYPE line followed by the markup, no meta tag.
html_text = doc.serialize()
expected = """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><title>Hello</title></head><body><p>hello</p></body></html>
"""
if html_text != expected:
    print("error serializing HTML document 1")
    sys.exit(1)

# Encoding requested: a <meta charset> element is expected as the first
# child of <head>, spelled exactly as the encoding was passed in.
html_text = doc.serialize("ISO-8859-1")
expected = """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
"""
if html_text != expected:
    print("error serializing HTML document 2")
    sys.exit(1)

# Formatting only: line breaks around the top-level sections, no meta tag.
html_text = doc.serialize(format=1)
expected = """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head><title>Hello</title></head>
<body><p>hello</p></body>
</html>
"""
if html_text != expected:
    print("error serializing HTML document 3")
    sys.exit(1)

# Encoding plus formatting: the meta tag and the title each get their own
# line inside <head>.  This is the only check that also prints the actual
# output on failure.
html_text = doc.serialize("iso-8859-1", 1)
expected = """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta charset="iso-8859-1">
<title>Hello</title>
</head>
<body><p>hello</p></body>
</html>
"""
if html_text != expected:
    print("error serializing HTML document 4", html_text)
    sys.exit(1)
#
# Test serializing a subnode
#
# NOTE(review): presumably this clears any meta encoding recorded on the
# document before the root element is serialized -- confirm against the
# libxml2 documentation.
doc.htmlSetMetaEncoding(None)
root = doc.getRootElement()

# Each entry is (positional args, keyword args, expected markup, failure
# message).  Unlike the whole-document expectations, these have no DOCTYPE
# line and no trailing newline.
html_root_cases = (
    (
        (),
        {},
        """<html><head><title>Hello</title></head><body><p>hello</p></body></html>""",
        "error serializing HTML root 1",
    ),
    (
        ("ISO-8859-1",),
        {},
        """<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""",
        "error serializing HTML root 2",
    ),
    (
        (),
        {"format": 1},
        """<html>
<head><title>Hello</title></head>
<body><p>hello</p></body>
</html>""",
        "error serializing HTML root 3",
    ),
    (
        ("iso-8859-1", 1),
        {},
        """<html>
<head>
<meta charset="iso-8859-1">
<title>Hello</title>
</head>
<body><p>hello</p></body>
</html>""",
        "error serializing HTML root 4",
    ),
)

# Run the cases in order and stop at the first mismatch.
for args, kwargs, expected, message in html_root_cases:
    if root.serialize(*args, **kwargs) != expected:
        print(message)
        sys.exit(1)

# Release the HTML document before the final memory check.
doc.freeDoc()
# Memory debug specific
# After parser cleanup, the value returned by debugMemory(1) is treated as a
# byte count: zero means the run was clean, anything else is reported.
libxml2.cleanupParser()
if libxml2.debugMemory(1) != 0:
    print("Memory leak %d bytes" % (libxml2.debugMemory(1)))
else:
    print("OK")