Edit

kc3-lang/libxml2/HTMLtree.c

Branch :

  • Show log

    Commit

  • Author : Nick Wellnhofer
    Date : 2025-05-28 16:02:41
    Hash : 6a6a46f0
    Message : doc: Fix autolink errors Fix links, remove links to internal functions.

  • HTMLtree.c
  • /*
     * HTMLtree.c : implementation of access function for an HTML tree.
     *
     * See Copyright for the status of this software.
     *
     * Author: Daniel Veillard
     */
    
    
    #define IN_LIBXML
    #include "libxml.h"
    #ifdef LIBXML_HTML_ENABLED
    
    #include <string.h> /* for memset() only ! */
    #include <ctype.h>
    #include <limits.h>
    #include <stdlib.h>
    
    #include <libxml/xmlmemory.h>
    #include <libxml/HTMLparser.h>
    #include <libxml/HTMLtree.h>
    #include <libxml/entities.h>
    #include <libxml/xmlerror.h>
    #include <libxml/parserInternals.h>
    #include <libxml/uri.h>
    
    #include "private/buf.h"
    #include "private/html.h"
    #include "private/error.h"
    #include "private/html.h"
    #include "private/io.h"
    #include "private/save.h"
    #include "private/tree.h"
    
    /************************************************************************
     *									*
     *		Getting/Setting encoding meta tags			*
     *									*
     ************************************************************************/
    
    /*
     * Describes an encoding declaration found in a meta element: the
     * attribute holding it, the attribute's value string and the
     * position of the encoding name inside that string.
     */
    typedef struct {
        xmlAttrPtr attr; /* charset or content */
        const xmlChar *attrValue; /* value of attr, or "" if unavailable */
        htmlMetaEncodingOffsets off; /* encoding name range within attrValue */
    } htmlMetaEncoding;
    
    /**
     * Find the first element child with a given name, ignoring case.
     *
     * @param parent  the node whose children are searched
     * @param name  the element name to look for
     * @returns the matching child or NULL if there is none.
     */
    static htmlNodePtr
    htmlFindFirstChild(htmlNodePtr parent, const char *name) {
        htmlNodePtr node = parent->children;

        while (node != NULL) {
            if ((node->type == XML_ELEMENT_NODE) &&
                (xmlStrcasecmp(node->name, BAD_CAST name) == 0))
                break;
            node = node->next;
        }

        /* NULL here means the list was exhausted without a match. */
        return(node);
    }
    
    /**
     * Locate the head element: the first "head" child of the first
     * "html" child of the document.
     *
     * @param doc  the document (may be NULL)
     * @returns the head element or NULL if `doc` is NULL or either
     * element is missing.
     */
    static htmlNodePtr
    htmlFindHead(htmlDocPtr doc) {
        htmlNodePtr root = NULL;

        if (doc != NULL)
            root = htmlFindFirstChild((htmlNodePtr) doc, "html");

        return((root != NULL) ? htmlFindFirstChild(root, "head") : NULL);
    }
    
    /**
     * Find a `charset=` parameter in the value of a Content-Type meta
     * attribute and record where the encoding name is located.
     *
     * The word "charset" is matched ignoring case and whitespace around
     * the `=` is skipped. A value in single or double quotes extends to
     * the closing quote, with surrounding whitespace trimmed; a missing
     * closing quote makes the function fail. An unquoted value ends at
     * ';', whitespace or the end of the string.
     *
     * @param val  the zero-terminated attribute value
     * @param off  receives the start and end offsets of the encoding
     *             name and the total length of `val`; only written on
     *             success
     * @returns 1 if an encoding name was found, 0 otherwise.
     */
    int
    htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
        const xmlChar *cur = val;
        size_t first, last;

        /*
         * Scan for "charset" followed by optional whitespace and '='.
         * On loop exit, `cur` points at the first non-whitespace
         * character after the '='.
         */
        for (;;) {
            for (; (*cur != 'c') && (*cur != 'C'); cur++) {
                if (*cur == 0)
                    return(0);
            }
            cur++;

            if (xmlStrncasecmp(cur, BAD_CAST "harset", 6) != 0)
                continue;
            cur += 6;

            while (IS_WS_HTML(*cur))
                cur++;
            if (*cur != '=')
                continue;
            cur++;

            while (IS_WS_HTML(*cur))
                cur++;
            break;
        }

        if (*cur == 0)
            return(0);

        if ((*cur != '"') && (*cur != '\'')) {
            /* Unquoted value */
            first = cur - val;

            while ((*cur != 0) && (*cur != ';') && (!IS_WS_HTML(*cur)))
                cur++;

            last = cur - val;
        } else {
            int delim = *cur;

            cur++;
            while (IS_WS_HTML(*cur))
                cur++;

            first = cur - val;
            last = first;

            /* `last` trails one past the latest non-whitespace char. */
            for (; *cur != delim; cur++) {
                if (*cur == 0)
                    return(0);
                if (!IS_WS_HTML(*cur))
                    last = cur + 1 - val;
            }
        }

        off->start = first;
        off->end = last;
        /* Consumed prefix plus the unread remainder: total length. */
        off->size = cur - val + strlen((char *) cur);

        return(1);
    }
    
    /**
     * Find the attribute of a meta element that carries an encoding
     * declaration.
     *
     * A "charset" attribute is returned as soon as it is seen. Otherwise
     * the "content" attribute is returned, provided the element also has
     * an "http-equiv" attribute whose value is a single text node equal
     * to "Content-Type" (ignoring case). Attributes with a namespace are
     * ignored.
     *
     * @param elem  the node to inspect
     * @param outIsContentType  set to 0 when a charset attribute is
     *                          returned and to 1 for a content attribute;
     *                          not written when NULL is returned
     * @returns the attribute or NULL if `elem` is not a meta element
     * with an encoding declaration.
     */
    static xmlAttrPtr
    htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
        xmlAttrPtr attr, contentAttr = NULL;
        int isContentType = 0;

        /*
         * This function is also called on arbitrary child nodes, and
         * other node kinds can be named "meta" as well. Only walk the
         * `properties` list of genuine element nodes.
         */
        if ((elem->type != XML_ELEMENT_NODE) ||
            (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
            return(NULL);

        for (attr = elem->properties; attr != NULL; attr = attr->next) {
            if (attr->ns != NULL)
                continue;
            if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
                *outIsContentType = 0;
                return(attr);
            }
            if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
                contentAttr = attr;
            if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
                (attr->children != NULL) &&
                (attr->children->type == XML_TEXT_NODE) &&
                (attr->children->next == NULL) &&
                (xmlStrcasecmp(attr->children->content,
                               BAD_CAST "Content-Type") == 0))
                isContentType = 1;
        }

        /* The content attribute only counts together with http-equiv. */
        if ((isContentType) && (contentAttr != NULL)) {
            *outIsContentType = 1;
            return(contentAttr);
        }

        return(NULL);
    }
    
    /**
     * Check whether a node is a meta element with an encoding
     * declaration and, if so, describe where the encoding name is
     * stored.
     *
     * For a charset attribute, the encoding name is the attribute value
     * with leading and trailing whitespace trimmed. For a content
     * attribute, the value is parsed with htmlParseContentType. If the
     * attribute value is not a single text node with content, the empty
     * string is used.
     *
     * @param elem  the node to inspect
     * @param menc  receives the attribute, its value and the offsets of
     *              the encoding name; `attr` and `attrValue` are only
     *              written on success
     * @returns 1 if an encoding declaration was found, 0 otherwise.
     */
    static int
    htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
        xmlAttrPtr attr;
        const xmlChar *val = NULL;
        int isContentType;

        if ((elem->type != XML_ELEMENT_NODE) ||
            (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
            return(0);

        attr = htmlFindMetaEncodingAttr(elem, &isContentType);
        if (attr == NULL)
            return(0);

        if ((attr->children != NULL) &&
            (attr->children->type == XML_TEXT_NODE) &&
            (attr->children->next == NULL) &&
            (attr->children->content != NULL))
            val = attr->children->content;
        else
            val = BAD_CAST "";

        if (!isContentType) {
            size_t size = strlen((char *) val);
            size_t start = 0;
            size_t end = size;

            while ((start < size) && (IS_WS_HTML(val[start])))
                start += 1;

            /*
             * Never trim past `start`. Otherwise a value made up of
             * whitespace only would produce end < start, which makes
             * `end - start` wrap around in users of the offsets and
             * causes the whitespace to be emitted twice when the
             * encoding is replaced.
             */
            while ((end > start) && (IS_WS_HTML(val[end-1])))
                end -= 1;

            menc->attr = attr;
            menc->attrValue = val;
            menc->off.start = start;
            menc->off.end = end;
            menc->off.size = size;

            return(1);
        } else {
            if (htmlParseContentType(val, &menc->off)) {
                menc->attr = attr;
                menc->attrValue = val;

                return(1);
            }
        }

        return(0);
    }
    
    /**
     * Build a copy of a meta attribute value in which the encoding name
     * is replaced.
     *
     * @param menc  the encoding declaration to base the new value on
     * @param encoding  the new encoding name; "HTML" (ignoring case) is
     *                  written as "ASCII"
     * @returns a newly allocated, zero-terminated string or NULL if the
     * allocation failed.
     */
    static xmlChar *
    htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
        const xmlChar *oldVal = menc->attrValue;
        size_t prefixLen = menc->off.start;
        size_t suffixLen = menc->off.size - menc->off.end;
        size_t encLen, total;
        xmlChar *result;

        /*
         * The pseudo "HTML" encoding only produces ASCII.
         */
        if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
            encoding = "ASCII";

        encLen = strlen(encoding);
        total = prefixLen + encLen + suffixLen;

        result = xmlMalloc(total + 1);
        if (result == NULL)
            return(NULL);

        /* text before the old name + new name + text after the old name */
        memcpy(result, oldVal, prefixLen);
        memcpy(result + prefixLen, encoding, encLen);
        memcpy(result + prefixLen + encLen, oldVal + menc->off.end, suffixLen);
        result[total] = 0;

        return(result);
    }
    
    /**
     * Look up an encoding declaration in the meta tags of the head
     * element. The first declaration found is used.
     *
     * The returned string points into attribute content and can contain
     * trailing garbage. It should be copied before modifying or freeing
     * nodes.
     *
     * @param doc  the document
     * @returns the encoding or NULL if not found.
     */
    const xmlChar *
    htmlGetMetaEncoding(xmlDoc *doc) {
        htmlNodePtr head = htmlFindHead(doc);
        htmlNodePtr child;
        htmlMetaEncoding found;

        if (head == NULL)
            return(NULL);

        child = head->children;
        while (child != NULL) {
            /*
             * A `const xmlChar *` result can only be a suffix of the
             * attribute value. With http-equiv meta tags, more data
             * may follow the charset, although that's probably rare
             * in practice.
             */
            if (htmlParseMetaEncoding(child, &found))
                return(found.attrValue + found.off.start);
            child = child->next;
        }

        return(NULL);
    }
    
    /**
     * Creates or updates a meta tag with an encoding declaration.
     *
     * Every meta child of the head element that declares an encoding is
     * rewritten. If there is none, a new `<meta charset>` element is
     * inserted as first child of the head element.
     *
     * NOTE: This will not change the document content encoding.
     *
     * @param doc  the document
     * @param encoding  the encoding string
     * @returns 0 in case of success, 1 if no head element was found or
     * arguments are invalid and -1 if memory allocation failed.
     */
    int
    htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) {
        htmlNodePtr head, node, newMeta;
        int updated = 0;

        if (encoding == NULL)
            return(1);

        head = htmlFindHead(doc);
        if (head == NULL)
            return(1);

        for (node = head->children; node != NULL; node = node->next) {
            htmlMetaEncoding menc;
            xmlChar *value;
            int res;

            if (!htmlParseMetaEncoding(node, &menc))
                continue;

            updated = 1;

            /*
             * The new value must be built first: menc.attrValue points
             * into the attribute content that is discarded below.
             */
            value = htmlUpdateMetaEncoding(&menc, (char *) encoding);
            if (value == NULL)
                return(-1);
            xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
            res = xmlNodeAddContent((xmlNodePtr) menc.attr, value);
            xmlFree(value);

            if (res < 0)
                return(-1);
        }

        if (updated)
            return(0);

        /* No declaration present, create one. */
        newMeta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
        if (newMeta == NULL)
            return(-1);

        if (xmlNewProp(newMeta, BAD_CAST "charset", encoding) == NULL) {
            xmlFreeNode(newMeta);
            return(-1);
        }

        if (head->children != NULL)
            xmlAddPrevSibling(head->children, newMeta);
        else
            xmlAddChild(head, newMeta);

        return(0);
    }
    
    /**
     * Determine if a given attribute is a boolean attribute. This
     * doesn't handle HTML5.
     *
     * @deprecated Internal function, don't use.
     *
     * @param name  the name of the attribute to check
     * @returns false if the attribute is not boolean, true otherwise.
     */
    int
    htmlIsBooleanAttr(const xmlChar *name)
    {
        const char *str = NULL;
    
        if (name == NULL)
            return(0);
    
        /*
         * These are the HTML attributes which will be output
         * in minimized form, i.e. `<option selected="selected">` will be
         * output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output
         * Method":
         *
         * "checked", "compact", "declare", "defer", "disabled", "ismap",
         * "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
         * "selected"
         *
         * Additional attributes from HTML5 (not implemented yet):
         *
         * "allowfullscreen", "alpha", "async", "autofocus", "autoplay",
         * "controls", "default", "formnovalidate", "inert", "itemscope",
         * "loop", "muted", "nomodule", "novalidate", "open", "playsinline",
         * "required", "reversed", "shadowrootdelegatesfocus",
         * "shadowrootclonable", "shadowrootserializable",
         * "shadowrootcustomelementregistry", "truespeed"
         */
    
        switch (name[0] | 0x20) {
            case 'c':
                name += 1;
                switch (name[0] | 0x20) {
                    case 'h': str = "ecked"; break;
                    case 'o': str = "mpact"; break;
                }
                break;
            case 'd':
                name += 1;
                switch (name[0] | 0x20) {
                    case 'e':
                        name += 1;
                        switch (name[0] | 0x20) {
                            case 'c': str = "lare"; break;
                            case 'f': str = "er"; break;
                        }
                        break;
                    case 'i': str = "sabled"; break;
                }
                break;
            case 'i':
                str = "smap";
                break;
            case 'm':
                str = "ultiple";
                break;
            case 'n':
                name += 1;
                if ((name[0] | 0x20) != 'o')
                    break;
                name += 1;
                switch (name[0] | 0x20) {
                    case 'h': str = "ref"; break;
                    case 'r': str = "esize"; break;
                    case 's': str = "hade"; break;
                    case 'w': str = "rap"; break;
                }
                break;
            case 'r':
                str = "eadonly";
                break;
            case 's':
                str = "elected";
                break;
        }
    
        if (str == NULL)
            return(0);
    
        return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0);
    }
    
    #ifdef LIBXML_OUTPUT_ENABLED
    /************************************************************************
     *									*
     *		Dumping HTML tree content to a simple buffer		*
     *									*
     ************************************************************************/
    
    /**
     * Open an output encoding handler, using the "HTML" pseudo encoding
     * if no encoding name is given.
     *
     * @param encoding  the encoding name (optional)
     * @param out  receives the handler
     * @returns the result of xmlOpenCharEncodingHandler.
     */
    static xmlParserErrors
    htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
        const char *name = (encoding != NULL) ? encoding : "HTML";

        return(xmlOpenCharEncodingHandler(name, /* output */ 1, out));
    }
    
    /**
     * Serialize an HTML node to an xmlBuf.
     *
     * A temporary output buffer without encoder or I/O callbacks is
     * wrapped around `buf` for the duration of the call.
     *
     * @param buf  the xmlBuf output
     * @param doc  the document (unused)
     * @param cur  the current node
     * @param format  should formatting newlines been added
     * @returns the number of bytes appended to `buf` or (size_t) -1 in
     * case of error
     */
    static size_t
    htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
                          xmlNodePtr cur, int format) {
        xmlOutputBufferPtr wrapper;
        size_t before, written;

        if ((cur == NULL) || (buf == NULL))
            return((size_t) -1);

        wrapper = (xmlOutputBufferPtr) xmlMalloc(sizeof(*wrapper));
        if (wrapper == NULL)
            return((size_t) -1);

        memset(wrapper, 0, sizeof(*wrapper));
        wrapper->buffer = buf;
        wrapper->encoder = NULL;
        wrapper->writecallback = NULL;
        wrapper->closecallback = NULL;
        wrapper->context = NULL;
        wrapper->written = 0;

        before = xmlBufUse(buf);
        htmlNodeDumpInternal(wrapper, cur, NULL, format);
        written = (wrapper->error) ? (size_t) -1 : xmlBufUse(buf) - before;

        /* Release the wrapper struct only; `buf` stays with the caller. */
        xmlFree(wrapper);

        return(written);
    }
    
    /**
     * Serialize an HTML node to an xmlBuffer. Always uses UTF-8 and
     * always enables formatting.
     *
     * @param buf  the HTML buffer output
     * @param doc  the document
     * @param cur  the current node
     * @returns the number of bytes written, capped at INT_MAX, or -1
     * in case of error
     */
    int
    htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) {
        xmlBufPtr wrapped;
        size_t written;
        int status;

        if ((buf == NULL) || (cur == NULL))
            return(-1);

        xmlInitParser();

        wrapped = xmlBufFromBuffer(buf);
        if (wrapped == NULL)
            return(-1);

        written = htmlBufNodeDumpFormat(wrapped, doc, cur, 1);

        /* Always hand the data back, even if serialization failed. */
        status = xmlBufBackToBuffer(wrapped, buf);

        if ((status < 0) || (written == (size_t) -1))
            return(-1);
        if (written > INT_MAX)
            return(INT_MAX);

        return((int) written);
    }
    
    /**
     * Serialize an HTML node to a FILE.
     *
     * If encoding is NULL, ASCII with HTML 4.0 named character entities
     * will be used. This is inefficient compared to UTF-8 and might be
     * changed in a future version.
     *
     * @param out  the FILE pointer
     * @param doc  the document (unused)
     * @param cur  the current node
     * @param encoding  the document encoding (optional)
     * @param format  should formatting newlines been added
     * @returns the number of bytes written or -1 in case of failure.
     */
    int
    htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED,
                           xmlNode *cur, const char *encoding, int format) {
        xmlCharEncodingHandlerPtr encoder;
        xmlOutputBufferPtr output;

        xmlInitParser();

        if (htmlFindOutputEncoder(encoding, &encoder) != XML_ERR_OK)
            return(-1);

        /*
         * Wrap the FILE in an output buffer which converts to the
         * requested encoding.
         */
        output = xmlOutputBufferCreateFile(out, encoder);
        if (output == NULL) {
            xmlCharEncCloseFunc(encoder);
            return(-1);
        }

        htmlNodeDumpInternal(output, cur, NULL, format);

        /* The result of closing the buffer is the function result. */
        return(xmlOutputBufferClose(output));
    }
    
    /**
     * Convenience wrapper around #htmlNodeDumpFileFormat which passes
     * no encoding and sets `format` to 1. Formatting is typically
     * undesired, so use of this function is DISCOURAGED in favor of
     * #htmlNodeDumpFileFormat. The result of the call is discarded.
     *
     * @param out  the FILE pointer
     * @param doc  the document
     * @param cur  the current node
     */
    void
    htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) {
        (void) htmlNodeDumpFileFormat(out, doc, cur, /* encoding */ NULL,
                                      /* format */ 1);
    }
    
    /**
     * Serialize an HTML document to memory, also returning the size of
     * the result. It's up to the caller to free the memory.
     *
     * Uses the encoding of the document. If the document has no
     * encoding, ASCII with HTML 4.0 named character entities will
     * be used. This is inefficient compared to UTF-8 and might be
     * changed in a future version.
     *
     * On any failure, including a result which doesn't fit into an
     * `int` or a failed copy, `*mem` is NULL and `*size` is 0. Nothing
     * is written if `mem` or `size` is NULL.
     *
     * @param cur  the document
     * @param mem  OUT: the memory pointer
     * @param size  OUT: the memory length
     * @param format  should formatting newlines been added
     */
    void
    htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) {
        xmlOutputBufferPtr buf;
        xmlCharEncodingHandlerPtr handler = NULL;

        xmlInitParser();

        if ((mem == NULL) || (size == NULL))
            return;
        *mem = NULL;
        *size = 0;
        if (cur == NULL)
            return;

        if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
            return;
        buf = xmlAllocOutputBuffer(handler);
        if (buf == NULL) {
            xmlCharEncCloseFunc(handler);
            return;
        }

        htmlDocContentDumpFormatOutput(buf, cur, NULL, format);

        xmlOutputBufferFlush(buf);

        if (!buf->error) {
            /* With an encoder, the converted data ends up in `conv`. */
            xmlBufPtr result = (buf->conv != NULL) ? buf->conv : buf->buffer;
            size_t use = xmlBufUse(result);

            /*
             * The length is reported as `int`, so larger results can't
             * be returned. Only set `*size` after the copy succeeded
             * to keep both outputs consistent.
             */
            if (use <= INT_MAX) {
                *mem = xmlStrndup(xmlBufContent(result), (int) use);
                if (*mem != NULL)
                    *size = (int) use;
            }
        }

        xmlOutputBufferClose(buf);
    }
    
    /**
     * Convenience wrapper around #htmlDocDumpMemoryFormat which sets
     * `format` to 1. Formatting is typically undesired. Also see the
     * warnings there. Use of this function is DISCOURAGED in favor of
     * #htmlDocContentDumpFormatOutput.
     *
     * @param cur  the document
     * @param mem  OUT: the memory pointer
     * @param size  OUT: the memory length
     */
    void
    htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) {
        htmlDocDumpMemoryFormat(cur, mem, size, /* format */ 1);
    }
    
    
    /************************************************************************
     *									*
     *		Dumping HTML tree content to an I/O output buffer	*
     *									*
     ************************************************************************/
    
    /**
     * Serialize the DOCTYPE declaration of an HTML document, if it has
     * an internal subset.
     *
     * A system identifier without public identifier is omitted if it
     * equals "about:legacy-compat".
     *
     * Ignores `encoding` and uses the encoding of the output buffer.
     *
     * @param buf  the HTML buffer output
     * @param doc  the document
     * @param encoding  the encoding string (unused)
     */
    static void
    htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                      const char *encoding ATTRIBUTE_UNUSED) {
        xmlDtdPtr dtd = doc->intSubset;
        const xmlChar *publicId, *systemId;

        if (dtd == NULL)
            return;

        publicId = dtd->ExternalID;
        systemId = dtd->SystemID;

        xmlOutputBufferWrite(buf, 10, "<!DOCTYPE ");
        xmlOutputBufferWriteString(buf, (const char *) dtd->name);

        if (publicId != NULL) {
            xmlOutputBufferWrite(buf, 8, " PUBLIC ");
            xmlOutputBufferWriteQuotedString(buf, publicId);
            if (systemId != NULL) {
                xmlOutputBufferWrite(buf, 1, " ");
                xmlOutputBufferWriteQuotedString(buf, systemId);
            }
        } else if ((systemId != NULL) &&
                   (xmlStrcmp(systemId, BAD_CAST "about:legacy-compat") != 0)) {
            xmlOutputBufferWrite(buf, 8, " SYSTEM ");
            xmlOutputBufferWriteQuotedString(buf, systemId);
        }

        xmlOutputBufferWrite(buf, 2, ">\n");
    }
    
    /**
     * Serialize the value of a URI attribute.
     *
     * Leading whitespace is written unchanged. After that, bytes up to
     * 0x20 and from 0x7F upwards are percent-escaped, `"` and `&` are
     * replaced with `&quot;` and `&amp;`, and everything else is copied
     * verbatim.
     *
     * See appendix "B.2.1 Non-ASCII characters in URI attribute
     * values" in the HTML 4.01 spec. This is also recommended
     * by the HTML output method of the XSLT 1.0 spec. Space and
     * control chars are escaped as well.
     *
     * @param buf  the HTML buffer output
     * @param content  the zero-terminated attribute value
     */
    static void
    htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) {
        static const char hexDigits[] = "0123456789ABCDEF";
        const xmlChar *chunk = content; /* start of the unwritten run */
        const xmlChar *cur = content;

        /* Skip over initial whitespace */
        while (IS_WS_HTML(*cur))
            cur++;
        if (cur > chunk) {
            xmlOutputBufferWrite(buf, cur - chunk, (char *) chunk);
            chunk = cur;
        }

        for (;;) {
            char pct[3];
            int ch = *cur;

            /* Extend the run of bytes which need no escaping. */
            while ((ch > 0x20) && (ch < 0x7F) && (ch != '"') && (ch != '&')) {
                cur += 1;
                ch = *cur;
            }

            if (cur > chunk)
                xmlOutputBufferWrite(buf, cur - chunk, (char *) chunk);

            if (ch == 0)
                break;

            if (ch == '"') {
                xmlOutputBufferWrite(buf, 6, "&quot;");
            } else if (ch == '&') {
                xmlOutputBufferWrite(buf, 5, "&amp;");
            } else {
                pct[0] = '%';
                pct[1] = hexDigits[(ch >> 4) & 0x0F];
                pct[2] = hexDigits[ch & 0x0F];
                xmlOutputBufferWrite(buf, 3, pct);
            }

            /* Continue after the byte that was just replaced. */
            cur += 1;
            chunk = cur;
        }
    }
    
    /**
     * Serialize an HTML attribute, preceded by a space.
     *
     * Attributes without children and boolean attributes are written in
     * minimized form, that is without a value. Text children of `href`,
     * `action` and `src` attributes, as well as `name` on `a` elements,
     * are serialized as URIs if neither attribute nor element has a
     * namespace. Entity references are written unexpanded.
     *
     * @param buf  the HTML buffer output
     * @param cur  the attribute pointer
     */
    static void
    htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) {
        xmlNodePtr part;
        int uriLike = 0;

        xmlOutputBufferWrite(buf, 1, " ");

        if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
            xmlOutputBufferWriteString(buf, (const char *) cur->ns->prefix);
            xmlOutputBufferWrite(buf, 1, ":");
        }
        xmlOutputBufferWriteString(buf, (const char *) cur->name);

        /*
         * The HTML5 spec requires to always serialize empty attribute
         * values as `=""`. We should probably align with HTML5 at some
         * point.
         */
        if ((cur->children == NULL) || (htmlIsBooleanAttr(cur->name)))
            return;

        xmlOutputBufferWrite(buf, 2, "=\"");

        /*
         * Special handling of URIs doesn't conform to HTML5 and
         * should probably be removed at some point.
         */
        if ((cur->ns == NULL) && (cur->parent != NULL) &&
            (cur->parent->ns == NULL)) {
            if ((xmlStrcasecmp(cur->name, BAD_CAST "href") == 0) ||
                (xmlStrcasecmp(cur->name, BAD_CAST "action") == 0) ||
                (xmlStrcasecmp(cur->name, BAD_CAST "src") == 0)) {
                uriLike = 1;
            } else if ((xmlStrcasecmp(cur->name, BAD_CAST "name") == 0) &&
                       (xmlStrcasecmp(cur->parent->name, BAD_CAST "a") == 0)) {
                uriLike = 1;
            }
        }

        for (part = cur->children; part != NULL; part = part->next) {
            switch (part->type) {
            case XML_TEXT_NODE:
                if (part->content == NULL)
                    break;
                if (uriLike)
                    htmlSerializeUri(buf, part->content);
                else
                    xmlSerializeText(buf, part->content, SIZE_MAX,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                break;

            case XML_ENTITY_REF_NODE:
                /* TODO: We should probably expand entity refs */
                xmlOutputBufferWrite(buf, 1, "&");
                xmlOutputBufferWriteString(buf, (char *) part->name);
                xmlOutputBufferWrite(buf, 1, ";");
                break;

            default:
                /* Other node types produce no output. */
                break;
            }
        }

        xmlOutputBufferWrite(buf, 1, "\"");
    }
    
    /**
     * Serialize an HTML node to an output buffer.
     *
     * If `encoding` is specified, it is used to create or update meta
     * tags containing the character encoding.
     *
     * @param buf  the HTML buffer output
     * @param cur  the current node
     * @param encoding  the encoding string (optional)
     * @param format  should formatting newlines been added
     */
    void
    htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur,
                         const char *encoding, int format) {
        xmlNodePtr root, parent, metaHead = NULL;
        xmlAttrPtr attr;
        const htmlElemDesc * info;
        int isRaw = 0;
    
        xmlInitParser();
    
        if ((cur == NULL) || (buf == NULL)) {
    	return;
        }
    
        root = cur;
        parent = cur->parent;
        while (1) {
            switch (cur->type) {
            case XML_HTML_DOCUMENT_NODE:
            case XML_DOCUMENT_NODE:
                if (((xmlDocPtr) cur)->intSubset != NULL) {
                    htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
                }
                if (cur->children != NULL) {
                    /* Always validate cur->parent when descending. */
                    if (cur->parent == parent) {
                        parent = cur;
                        cur = cur->children;
                        continue;
                    }
                } else {
                    xmlOutputBufferWrite(buf, 1, "\n");
                }
                break;
    
            case XML_ELEMENT_NODE: {
                htmlMetaEncoding menc;
                int isMeta = 0;
                int addMeta = 0;
    
                /*
                 * Some users like lxml are known to pass nodes with a corrupted
                 * tree structure. Fall back to a recursive call to handle this
                 * case.
                 */
                if ((cur->parent != parent) && (cur->children != NULL)) {
                    htmlNodeDumpInternal(buf, cur, encoding, format);
                    break;
                }
    
                /*
                 * Get specific HTML info for that node.
                 */
                if (cur->ns == NULL)
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;
    
                if (encoding != NULL) {
                    isMeta = htmlParseMetaEncoding(cur, &menc);
    
                    /*
                     * Don't add meta tag for "HTML" encoding.
                     */
                    if ((xmlStrcasecmp(BAD_CAST encoding,
                                       BAD_CAST "HTML") != 0) &&
                        (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
                        (parent != NULL) &&
                        (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
                        (parent->parent != NULL) &&
                        (parent->parent->parent == NULL) &&
                        (metaHead == NULL)) {
                        xmlNodePtr n;
    
                        metaHead = cur;
                        addMeta = 1;
    
                        for (n = cur->children; n != NULL; n = n->next) {
                            int unused;
    
                            if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
                                metaHead = NULL;
                                addMeta = 0;
                                break;
                            }
                        }
                    }
                }
    
                xmlOutputBufferWrite(buf, 1, "<");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWrite(buf, 1, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->nsDef)
                    xmlNsListDumpOutput(buf, cur->nsDef);
                attr = cur->properties;
                while (attr != NULL) {
                    if ((!isMeta) || (attr != menc.attr)) {
                        htmlAttrDumpOutput(buf, attr);
                    } else {
                        xmlOutputBufferWrite(buf, 1, " ");
                        xmlOutputBufferWriteString(buf, (char *) attr->name);
    
                        xmlOutputBufferWrite(buf, 2, "=\"");
                        xmlSerializeText(buf, menc.attrValue, menc.off.start,
                                         XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                        xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                         XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                        xmlSerializeText(buf, menc.attrValue + menc.off.end,
                                         menc.off.size - menc.off.end,
                                         XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                        xmlOutputBufferWrite(buf, 1, "\"");
                    }
                    attr = attr->next;
                }
    
                if ((info != NULL) && (info->empty)) {
                    xmlOutputBufferWrite(buf, 1, ">");
                } else if (cur->children == NULL) {
                    if (addMeta) {
                        xmlOutputBufferWrite(buf, 16, "><meta charset=\"");
                        xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                         XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                        xmlOutputBufferWrite(buf, 4, "\"></");
                    } else {
                        xmlOutputBufferWrite(buf, 3, "></");
                    }
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->ns->prefix);
                        xmlOutputBufferWrite(buf, 1, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWrite(buf, 1, ">");
                } else {
                    xmlOutputBufferWrite(buf, 1, ">");
                    if ((format) &&
                        ((addMeta) ||
                         ((info != NULL) && (!info->isinline) &&
                          (cur->children->type != HTML_TEXT_NODE) &&
                          (cur->children->type != HTML_ENTITY_REF_NODE) &&
                          (cur->children != cur->last) &&
                          (cur->name != NULL) &&
                          (cur->name[0] != 'p')))) /* p, pre, param */
                        xmlOutputBufferWrite(buf, 1, "\n");
                    if (addMeta) {
                        xmlOutputBufferWrite(buf, 15, "<meta charset=\"");
                        xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                         XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                        xmlOutputBufferWrite(buf, 2, "\">");
                        if ((format) &&
                            (cur->children->type != HTML_TEXT_NODE) &&
                            (cur->children->type != HTML_ENTITY_REF_NODE))
                            xmlOutputBufferWrite(buf, 1, "\n");
                    }
    
                    if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT))
                        isRaw = 1;
    
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
    
                if ((format) && (cur->next != NULL) &&
                    (info != NULL) && (!info->isinline)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWrite(buf, 1, "\n");
                }
    
                break;
            }
    
            case XML_ATTRIBUTE_NODE:
                htmlAttrDumpOutput(buf, (xmlAttrPtr) cur);
                break;
    
            case HTML_TEXT_NODE:
                if (cur->content == NULL)
                    break;
                if ((cur->name == (const xmlChar *)xmlStringTextNoenc) ||
                    (isRaw)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->content);
                } else {
                    xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML);
                }
                break;
    
            case HTML_COMMENT_NODE:
                if (cur->content != NULL) {
                    xmlOutputBufferWrite(buf, 4, "<!--");
                    xmlOutputBufferWriteString(buf, (const char *)cur->content);
                    xmlOutputBufferWrite(buf, 3, "-->");
                }
                break;
    
            case HTML_PI_NODE:
                if (cur->name != NULL) {
                    xmlOutputBufferWrite(buf, 2, "<?");
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    if (cur->content != NULL) {
                        xmlOutputBufferWrite(buf, 1, " ");
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->content);
                    }
                    xmlOutputBufferWrite(buf, 1, ">");
                }
                break;
    
            case HTML_ENTITY_REF_NODE:
                xmlOutputBufferWrite(buf, 1, "&");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWrite(buf, 1, ";");
                break;
    
            case HTML_PRESERVE_NODE:
                if (cur->content != NULL) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->content);
                }
                break;
    
            default:
                break;
            }
    
            while (1) {
                if (cur == root)
                    return;
                if (cur->next != NULL) {
                    cur = cur->next;
                    break;
                }
    
                isRaw = 0;
    
                cur = parent;
                /* cur->parent was validated when descending. */
                parent = cur->parent;
    
                if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                    (cur->type == XML_DOCUMENT_NODE)) {
                    xmlOutputBufferWrite(buf, 1, "\n");
                } else {
                    if ((format) && (cur->ns == NULL))
                        info = htmlTagLookup(cur->name);
                    else
                        info = NULL;
    
                    if ((format) && (info != NULL) && (!info->isinline) &&
                        (cur->last->type != HTML_TEXT_NODE) &&
                        (cur->last->type != HTML_ENTITY_REF_NODE) &&
                        ((cur->children != cur->last) || (cur == metaHead)) &&
                        (cur->name != NULL) &&
                        (cur->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWrite(buf, 1, "\n");
    
                    xmlOutputBufferWrite(buf, 2, "</");
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                        xmlOutputBufferWrite(buf, 1, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWrite(buf, 1, ">");
    
                    if ((format) && (info != NULL) && (!info->isinline) &&
                        (cur->next != NULL)) {
                        if ((cur->next->type != HTML_TEXT_NODE) &&
                            (cur->next->type != HTML_ENTITY_REF_NODE) &&
                            (parent != NULL) &&
                            (parent->name != NULL) &&
                            (parent->name[0] != 'p')) /* p, pre, param */
                            xmlOutputBufferWrite(buf, 1, "\n");
                    }
    
                    if (cur == metaHead)
                        metaHead = NULL;
                }
            }
        }
    }
    
    /**
     * Serialize an HTML node to an output buffer.
     *
     * This is a thin wrapper that forwards `buf`, `cur` and `format`
     * to the internal node dumper. The `doc` and `encoding` arguments
     * are accepted for API compatibility only and are ignored.
     *
     * @param buf  the HTML buffer output
     * @param doc  the document (unused)
     * @param cur  the current node
     * @param encoding  the encoding string (unused)
     * @param format  non-zero if formatting newlines should be added
     */
    void
    htmlNodeDumpFormatOutput(xmlOutputBuffer *buf,
                             xmlDoc *doc ATTRIBUTE_UNUSED, xmlNode *cur,
                             const char *encoding ATTRIBUTE_UNUSED, int format) {
        /* Neither doc nor encoding is forwarded; NULL is passed instead. */
        htmlNodeDumpInternal(buf, cur, NULL, format);
    }
    
    /**
     * Serialize an HTML node to an output buffer with formatting
     * newlines always enabled.
     *
     * Behaves like #htmlNodeDumpFormatOutput with `format` set to 1,
     * which is typically undesired. Use of this function is DISCOURAGED
     * in favor of #htmlNodeDumpFormatOutput.
     *
     * @param buf  the HTML buffer output
     * @param doc  the document (unused)
     * @param cur  the current node
     * @param encoding  the encoding string (unused)
     */
    void
    htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED,
                       xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED) {
        /* This legacy entry point always asks for formatted output. */
        const int alwaysFormat = 1;

        htmlNodeDumpInternal(buf, cur, NULL, alwaysFormat);
    }
    
    /**
     * Serialize an HTML document to an output buffer.
     *
     * The document is handed to the internal node dumper as a node.
     * The `encoding` argument is ignored.
     *
     * @param buf  the HTML buffer output
     * @param cur  the document
     * @param encoding  the encoding string (unused)
     * @param format  non-zero if formatting newlines should be added
     */
    void
    htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur,
                                   const char *encoding ATTRIBUTE_UNUSED,
                                   int format) {
        /* The dumper works on nodes, so view the document as one. */
        xmlNodePtr docNode = (xmlNodePtr) cur;

        htmlNodeDumpInternal(buf, docNode, NULL, format);
    }
    
    /**
     * Serialize an HTML document to an output buffer with formatting
     * newlines always enabled.
     *
     * Behaves like #htmlDocContentDumpFormatOutput with `format` set
     * to 1, which is typically undesired. Use of this function is
     * DISCOURAGED in favor of #htmlDocContentDumpFormatOutput.
     *
     * @param buf  the HTML buffer output
     * @param cur  the document
     * @param encoding  the encoding string (unused)
     */
    void
    htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur,
                             const char *encoding ATTRIBUTE_UNUSED) {
        /* This legacy entry point always asks for formatted output. */
        const int alwaysFormat = 1;

        htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, alwaysFormat);
    }
    
    /************************************************************************
     *									*
     *		Saving functions front-ends				*
     *									*
     ************************************************************************/
    
    /**
     * Write the serialization of an HTML document to an already open
     * `FILE`.
     *
     * The output encoder is looked up from the encoding recorded in
     * the document. If the document has no encoding, ASCII with
     * HTML 4.0 named character entities will be used. This is
     * inefficient compared to UTF-8 and might be changed in a future
     * version.
     *
     * "Formatting" is enabled unconditionally, which is typically
     * undesired.
     *
     * Use of this function is DISCOURAGED in favor of
     * #htmlNodeDumpFileFormat.
     *
     * @param f  the FILE*
     * @param cur  the document
     * @returns the number of bytes written or -1 in case of failure.
     */
    int
    htmlDocDump(FILE *f, xmlDoc *cur) {
        xmlCharEncodingHandlerPtr encoder = NULL;
        xmlOutputBufferPtr out;

        xmlInitParser();

        if ((f == NULL) || (cur == NULL))
            return -1;

        if (htmlFindOutputEncoder((char *) cur->encoding,
                                  &encoder) != XML_ERR_OK)
            return -1;

        out = xmlOutputBufferCreateFile(f, encoder);
        if (out == NULL) {
            /* Release the encoder obtained above before bailing out. */
            xmlCharEncCloseFunc(encoder);
            return -1;
        }

        htmlDocContentDumpOutput(out, cur, NULL);

        /* The result of closing the buffer is this function's result. */
        return xmlOutputBufferClose(out);
    }
    
    /**
     * Save an HTML document to a file.
     *
     * Delegates to #htmlSaveFileFormat with `encoding` set to NULL and
     * `format` set to 1, which is typically undesired.
     *
     * Use of this function is DISCOURAGED in favor of
     * #htmlSaveFileFormat.
     *
     * @param filename  the filename (or URL)
     * @param cur  the document
     * @returns the value returned by #htmlSaveFileFormat: the number of
     * bytes written or -1 in case of failure.
     */
    int
    htmlSaveFile(const char *filename, xmlDoc *cur) {
        /* Fixed arguments of this legacy entry point. */
        const char *noEncoding = NULL;
        const int alwaysFormat = 1;

        return htmlSaveFileFormat(filename, cur, noEncoding, alwaysFormat);
    }
    
    /**
     * Serialize an HTML document to a file using a given encoding.
     *
     * If `filename` is `"-"`, stdout is used. This is potentially
     * insecure and might be changed in a future version.
     *
     * If encoding is NULL, ASCII with HTML 4.0 named character entities
     * will be used. This is inefficient compared to UTF-8 and might be
     * changed in a future version.
     *
     * Sets or updates meta tags containing the character encoding.
     *
     * @param filename  the filename
     * @param cur  the document
     * @param encoding  the document encoding (optional)
     * @param format  non-zero if formatting newlines should be added
     * @returns the number of bytes written or -1 in case of failure,
     * including failure to look up the encoder or to open the output.
     */
    int
    htmlSaveFileFormat(const char *filename, xmlDoc *cur,
                       const char *encoding, int format) {
        xmlOutputBufferPtr buf;
        xmlCharEncodingHandlerPtr handler = NULL;
        int ret;

        if ((cur == NULL) || (filename == NULL))
            return(-1);

        xmlInitParser();

        if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
            return(-1);

        /*
         * Open the output buffer for the target file.
         */
        buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
        if (buf == NULL) {
            /* Release the encoder obtained above before bailing out. */
            xmlCharEncCloseFunc(handler);
            /*
             * Report the failure as -1, matching the documented contract
             * and htmlDocDump. Returning 0 here would be indistinguishable
             * from "zero bytes written" for callers testing for < 0.
             */
            return(-1);
        }

        /*
         * NOTE(review): htmlDocContentDumpFormatOutput marks its encoding
         * parameter as unused; confirm whether meta tags are still updated
         * on this path as the function documentation states.
         */
        htmlDocContentDumpFormatOutput(buf, cur, encoding, format);

        ret = xmlOutputBufferClose(buf);
        return(ret);
    }
    
    /**
     * Save an HTML document to a file using a given encoding.
     *
     * Delegates to #htmlSaveFileFormat with `format` set to 1, which is
     * typically undesired. Also see the warnings there. Use of this
     * function is DISCOURAGED in favor of #htmlSaveFileFormat.
     *
     * @param filename  the filename
     * @param cur  the document
     * @param encoding  the document encoding
     * @returns the value returned by #htmlSaveFileFormat: the number of
     * bytes written or -1 in case of failure.
     */
    int
    htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) {
        /* This legacy entry point always asks for formatted output. */
        const int alwaysFormat = 1;

        return htmlSaveFileFormat(filename, cur, encoding, alwaysFormat);
    }
    
    #endif /* LIBXML_OUTPUT_ENABLED */
    
    #endif /* LIBXML_HTML_ENABLED */