HTMLtree.c

Branch :
Show log
Commit
Author : Nick Wellnhofer
Date : 2025-05-28 16:02:41
Hash : 6a6a46f0
Message : doc: Fix autolink errors Fix links, remove links to internal functions.
HTMLtree.c
/*
 * HTMLtree.c : implementation of access function for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */


#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED

#include <string.h> /* for memset() only ! */
#include <ctype.h>
#include <stdlib.h>

#include <libxml/xmlmemory.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/entities.h>
#include <libxml/xmlerror.h>
#include <libxml/parserInternals.h>
#include <libxml/uri.h>

#include "private/buf.h"
#include "private/html.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/save.h"
#include "private/tree.h"

/************************************************************************
 *									*
 *		Getting/Setting encoding meta tags			*
 *									*
 ************************************************************************/

/*
 * Result of locating an encoding declaration in a meta element.
 * Filled in by htmlParseMetaEncoding.
 */
typedef struct {
    xmlAttrPtr attr; /* charset or content */
    /* text value of attr, or "" if attr has no single text child */
    const xmlChar *attrValue;
    /* position of the encoding name inside attrValue (start, end, size) */
    htmlMetaEncodingOffsets off;
} htmlMetaEncoding;

/**
 * Find the first child element with the given name. Names are
 * compared with xmlStrcasecmp.
 *
 * @param parent  the node whose children are searched
 * @param name  the element name to look for
 * @returns the first matching child element or NULL if there is none.
 */
static htmlNodePtr
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
    htmlNodePtr node = parent->children;

    while (node != NULL) {
        if ((node->type == XML_ELEMENT_NODE) &&
            (xmlStrcasecmp(node->name, BAD_CAST name) == 0))
            break;
        node = node->next;
    }

    /* Either the match we stopped at or NULL after the last child. */
    return(node);
}

/**
 * Locate the "head" element, which must be a child of an "html"
 * element that is itself a child of the document node.
 *
 * @param doc  the document (may be NULL)
 * @returns the head element or NULL if it can't be found.
 */
static htmlNodePtr
htmlFindHead(htmlDocPtr doc) {
    htmlNodePtr root;

    if (doc != NULL) {
        root = htmlFindFirstChild((htmlNodePtr) doc, "html");
        if (root != NULL)
            return(htmlFindFirstChild(root, "head"));
    }

    return(NULL);
}

/**
 * Search a string for a `charset=value` parameter, as found in the
 * content attribute of an http-equiv meta tag. "charset" is matched
 * case-insensitively and may be separated from `=` and from the
 * value by whitespace. The value is either quoted with `"` or `'`
 * (surrounding whitespace inside the quotes is excluded) or extends
 * up to the next whitespace, `;` or the end of the string.
 *
 * @param val  the string to scan
 * @param off  on success, receives the start and end offset of the
 *             encoding name and the total length of `val`
 * @returns 1 if an encoding was found, 0 otherwise. `off` is only
 * written on success.
 */
int
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
    const xmlChar *cur = val;
    size_t encStart, encEnd;

    /*
     * Find "charset" followed by optional whitespace and '='.
     * A failed candidate simply resumes the search from the
     * current position.
     */
    for (;;) {
        for (; (*cur != 'c') && (*cur != 'C'); cur++) {
            if (*cur == 0)
                return(0);
        }
        cur++;

        if (xmlStrncasecmp(cur, BAD_CAST "harset", 6) != 0)
            continue;
        cur += 6;

        while (IS_WS_HTML(*cur))
            cur++;
        if (*cur != '=')
            continue;
        cur++;

        while (IS_WS_HTML(*cur))
            cur++;
        break;
    }

    if (*cur == 0)
        return(0);

    if ((*cur == '"') || (*cur == '\'')) {
        int delim = *cur;

        cur++;
        while (IS_WS_HTML(*cur))
            cur++;

        encStart = cur - val;
        encEnd = encStart;

        /*
         * encEnd tracks the position after the last non-whitespace
         * char, so trailing whitespace before the closing quote is
         * dropped. An unterminated quote is a failure.
         */
        for (; *cur != delim; cur++) {
            if (*cur == 0)
                return(0);
            if (!IS_WS_HTML(*cur))
                encEnd = cur + 1 - val;
        }
    } else {
        encStart = cur - val;

        while ((*cur != 0) && (*cur != ';') && (!IS_WS_HTML(*cur)))
            cur++;

        encEnd = cur - val;
    }

    off->start = encStart;
    off->end = encEnd;
    /* Bytes consumed so far plus the unscanned remainder. */
    off->size = cur - val + strlen((char *) cur);

    return(1);
}

/**
 * Find the attribute of a meta element that declares an encoding.
 *
 * A `charset` attribute is returned as soon as it is seen. Otherwise
 * the `content` attribute is returned if the element also has an
 * `http-equiv` attribute whose value is a single text node equal to
 * "Content-Type" (case-insensitive). Attributes in a namespace are
 * ignored.
 *
 * @param elem  the node to inspect
 * @param outIsContentType  set to 0 if a charset attribute is
 *        returned and to 1 if a content attribute is returned. Not
 *        written if NULL is returned.
 * @returns the attribute or NULL if `elem` is not a meta element or
 * has no encoding declaration.
 */
static xmlAttrPtr
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
    xmlAttrPtr attr, contentAttr = NULL;
    int isContentType = 0;

    /*
     * Callers may pass arbitrary child nodes. Only element nodes
     * have a property list, so reject everything else before
     * touching elem->properties.
     */
    if ((elem->type != XML_ELEMENT_NODE) ||
        (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
        return(NULL);

    for (attr = elem->properties; attr != NULL; attr = attr->next) {
        if (attr->ns != NULL)
            continue;
        if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
            *outIsContentType = 0;
            return(attr);
        }
        if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
            contentAttr = attr;
        if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
            (attr->children != NULL) &&
            (attr->children->type == XML_TEXT_NODE) &&
            (attr->children->next == NULL) &&
            (xmlStrcasecmp(attr->children->content,
                           BAD_CAST "Content-Type") == 0))
            isContentType = 1;
    }

    /* content only counts together with http-equiv="Content-Type" */
    if ((isContentType) && (contentAttr != NULL)) {
        *outIsContentType = 1;
        return(contentAttr);
    }

    return(NULL);
}

/**
 * Check whether a node is a meta element with an encoding
 * declaration and, if so, locate the encoding name inside the
 * attribute value.
 *
 * For a `charset` attribute, the encoding is the attribute value
 * with leading and trailing whitespace stripped. For a `content`
 * attribute, the value is parsed with htmlParseContentType.
 *
 * On success, `menc->off.start <= menc->off.end <= menc->off.size`
 * holds.
 *
 * @param elem  the node to inspect
 * @param menc  receives the attribute, its value and the offsets of
 *              the encoding name. Only fully initialized on success.
 * @returns 1 if an encoding declaration was found, 0 otherwise.
 */
static int
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
    xmlAttrPtr attr;
    const xmlChar *val = NULL;
    int isContentType;

    if ((elem->type != XML_ELEMENT_NODE) ||
        (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
        return(0);

    attr = htmlFindMetaEncodingAttr(elem, &isContentType);
    if (attr == NULL)
        return(0);

    /*
     * Only a value consisting of a single text node is used.
     * Anything else is treated as an empty value.
     */
    if ((attr->children != NULL) &&
        (attr->children->type == XML_TEXT_NODE) &&
        (attr->children->next == NULL) &&
        (attr->children->content != NULL))
        val = attr->children->content;
    else
        val = BAD_CAST "";

    if (!isContentType) {
        size_t size = strlen((char *) val);
        size_t start = 0;
        size_t end = size;

        while ((start < size) && (IS_WS_HTML(val[start])))
            start += 1;

        /*
         * Stop at start, not at 0: for a whitespace-only value the
         * leading scan already consumed everything and end must not
         * drop below start, otherwise users of the offsets would
         * compute a negative encoding length.
         */
        while ((end > start) && (IS_WS_HTML(val[end-1])))
            end -= 1;

        menc->attr = attr;
        menc->attrValue = val;
        menc->off.start = start;
        menc->off.end = end;
        menc->off.size = size;

        return(1);
    }

    if (htmlParseContentType(val, &menc->off)) {
        menc->attr = attr;
        menc->attrValue = val;

        return(1);
    }

    return(0);
}

/**
 * Build a copy of a meta attribute value in which the encoding name
 * is replaced with a new one. Text before and after the old encoding
 * name is preserved.
 *
 * @param menc  the parsed meta encoding (value and offsets)
 * @param encoding  the new encoding name
 * @returns a newly allocated string or NULL if a memory allocation
 * failed. The caller must free the result with xmlFree.
 */
static xmlChar *
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
    const xmlChar *oldVal = menc->attrValue;
    size_t prefixLen = menc->off.start;
    size_t tailOffset = menc->off.end;
    size_t tailLen = menc->off.size - menc->off.end;
    size_t encLen, total;
    xmlChar *result;

    /*
     * The pseudo "HTML" encoding only produces ASCII.
     */
    if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
        encoding = "ASCII";
    encLen = strlen((char *) encoding);

    /* Old length minus the old encoding name plus the new one. */
    total = menc->off.size - (tailOffset - prefixLen) + encLen;
    result = xmlMalloc(total + 1);
    if (result == NULL)
        return(NULL);

    memcpy(result, oldVal, prefixLen);
    memcpy(result + prefixLen, encoding, encLen);
    memcpy(result + prefixLen + encLen, oldVal + tailOffset, tailLen);
    result[total] = 0;

    return(result);
}

/**
 * Look up an encoding declaration in the meta tags.
 *
 * The returned string points into attribute content and can contain
 * trailing garbage. It should be copied before modifying or freeing
 * nodes.
 *
 * @param doc  the document
 * @returns the encoding or NULL if not found.
 */
const xmlChar *
htmlGetMetaEncoding(xmlDoc *doc) {
    htmlMetaEncoding menc;
    htmlNodePtr cur;
    htmlNodePtr head = htmlFindHead(doc);

    if (head == NULL)
        return(NULL);

    cur = head->children;
    while (cur != NULL) {
        /*
         * The first declaration wins. Since a `const xmlChar *` is
         * returned, only a suffix of the attribute value can be
         * handed out. With http-equiv meta tags, more data may
         * follow the charset, although that is probably rare in
         * practice.
         */
        if (htmlParseMetaEncoding(cur, &menc))
            return(menc.attrValue + menc.off.start);
        cur = cur->next;
    }

    return(NULL);
}

/**
 * Creates or updates a meta tag with an encoding declaration.
 *
 * Every meta element in the head that already declares an encoding
 * is rewritten. If there is none, a new `<meta charset="...">`
 * element is inserted as first child of the head.
 *
 * NOTE: This will not change the document content encoding.
 *
 * @param doc  the document
 * @param encoding  the encoding string
 * @returns 0 in case of success, 1 if no head element was found or
 * arguments are invalid and -1 if memory allocation failed.
 */
int
htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) {
    htmlNodePtr head, node, newMeta;
    int updated = 0;

    if (encoding == NULL)
        return(1);

    head = htmlFindHead(doc);
    if (head == NULL)
        return(1);

    for (node = head->children; node != NULL; node = node->next) {
        htmlMetaEncoding menc;
        xmlChar *value;
        int res;

        if (!htmlParseMetaEncoding(node, &menc))
            continue;

        updated = 1;

        /*
         * The new value must be built before the old content is
         * dropped since menc points into the old content.
         */
        value = htmlUpdateMetaEncoding(&menc, (char *) encoding);
        if (value == NULL)
            return(-1);
        xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
        res = xmlNodeAddContent((xmlNodePtr) menc.attr, value);
        xmlFree(value);

        if (res < 0)
            return(-1);
    }

    if (updated)
        return(0);

    /* No existing declaration, create a new meta element. */
    newMeta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
    if (newMeta == NULL)
        return(-1);

    if (xmlNewProp(newMeta, BAD_CAST "charset", encoding) == NULL) {
        xmlFreeNode(newMeta);
        return(-1);
    }

    if (head->children != NULL)
        xmlAddPrevSibling(head->children, newMeta);
    else
        xmlAddChild(head, newMeta);

    return(0);
}

/**
 * Determine if a given attribute is a boolean attribute. This
 * doesn't handle HTML5.
 *
 * @deprecated Internal function, don't use.
 *
 * @param name  the name of the attribute to check
 * @returns false if the attribute is not boolean, true otherwise.
 */
int
htmlIsBooleanAttr(const xmlChar *name)
{
    /*
     * These are the HTML attributes which will be output
     * in minimized form, i.e. `<option selected="selected">` will be
     * output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output
     * Method". They are grouped by first letter so that only a few
     * candidates have to be compared.
     *
     * Additional attributes from HTML5 (not implemented yet):
     *
     * "allowfullscreen", "alpha", "async", "autofocus", "autoplay",
     * "controls", "default", "formnovalidate", "inert", "itemscope",
     * "loop", "muted", "nomodule", "novalidate", "open", "playsinline",
     * "required", "reversed", "shadowrootdelegatesfocus",
     * "shadowrootclonable", "shadowrootserializable",
     * "shadowrootcustomelementregistry", "truespeed"
     */
    static const char *const attrsC[] = { "checked", "compact", NULL };
    static const char *const attrsD[] = {
        "declare", "defer", "disabled", NULL
    };
    static const char *const attrsI[] = { "ismap", NULL };
    static const char *const attrsM[] = { "multiple", NULL };
    static const char *const attrsN[] = {
        "nohref", "noresize", "noshade", "nowrap", NULL
    };
    static const char *const attrsR[] = { "readonly", NULL };
    static const char *const attrsS[] = { "selected", NULL };
    const char *const *candidate;

    if (name == NULL)
        return(0);

    /* Setting bit 0x20 folds ASCII upper case letters to lower case. */
    switch (name[0] | 0x20) {
        case 'c': candidate = attrsC; break;
        case 'd': candidate = attrsD; break;
        case 'i': candidate = attrsI; break;
        case 'm': candidate = attrsM; break;
        case 'n': candidate = attrsN; break;
        case 'r': candidate = attrsR; break;
        case 's': candidate = attrsS; break;
        default:
            return(0);
    }

    for (; *candidate != NULL; candidate++) {
        if (xmlStrcasecmp(name, BAD_CAST *candidate) == 0)
            return(1);
    }

    return(0);
}

#ifdef LIBXML_OUTPUT_ENABLED
/************************************************************************
 *									*
 *		Dumping HTML tree content to a simple buffer		*
 *									*
 ************************************************************************/

/**
 * Open an encoding handler for output.
 *
 * @param encoding  the encoding name, NULL selects "HTML"
 * @param out  receives the encoding handler
 * @returns the result of xmlOpenCharEncodingHandler.
 */
static xmlParserErrors
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
    /* An unspecified encoding falls back to HTML. */
    const char *name = (encoding != NULL) ? encoding : "HTML";

    return(xmlOpenCharEncodingHandler(name, /* output */ 1, out));
}

/**
 * Serialize an HTML node to an xmlBuf.
 *
 * A temporary output buffer without encoder and I/O callbacks is
 * wrapped around `buf`, so the serialized data is appended to `buf`
 * unconverted.
 *
 * @param buf  the xmlBuf output
 * @param doc  the document (unused)
 * @param cur  the current node
 * @param format  should formatting newlines been added
 * @returns the number of bytes written or `(size_t) -1` in case of
 * error
 */
static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
                      xmlNodePtr cur, int format) {
    xmlOutputBufferPtr out;
    size_t before, written;

    if ((cur == NULL) || (buf == NULL))
        return((size_t) -1);

    out = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
    if (out == NULL)
        return((size_t) -1);

    memset(out, 0, sizeof(*out));
    out->buffer = buf;
    out->encoder = NULL;
    out->writecallback = NULL;
    out->closecallback = NULL;
    out->context = NULL;
    out->written = 0;

    /* The buffer may already hold data, only count what is added. */
    before = xmlBufUse(buf);
    htmlNodeDumpInternal(out, cur, NULL, format);
    written = out->error ? (size_t) -1 : xmlBufUse(buf) - before;

    /* Only the wrapper is released, buf belongs to the caller. */
    xmlFree(out);
    return(written);
}

/**
 * Serialize an HTML node to an xmlBuffer. Always uses UTF-8.
 * Formatting newlines are always enabled.
 *
 * @param buf  the HTML buffer output
 * @param doc  the document
 * @param cur  the current node
 * @returns the number of bytes written (capped at INT_MAX) or -1 in
 * case of error
 */
int
htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) {
    xmlBufPtr wrapped;
    size_t nbytes;
    int status;

    if (buf == NULL)
        return(-1);
    if (cur == NULL)
        return(-1);

    xmlInitParser();

    wrapped = xmlBufFromBuffer(buf);
    if (wrapped == NULL)
        return(-1);

    nbytes = htmlBufNodeDumpFormat(wrapped, doc, cur, 1);

    /* Must happen even if the dump failed. */
    status = xmlBufBackToBuffer(wrapped, buf);

    if (nbytes == (size_t) -1)
        return(-1);
    if (status < 0)
        return(-1);
    if (nbytes > INT_MAX)
        return(INT_MAX);
    return((int) nbytes);
}

/**
 * Serialize an HTML node to a FILE stream.
 *
 * If encoding is NULL, ASCII with HTML 4.0 named character entities
 * will be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * @param out  the FILE pointer
 * @param doc  the document (unused)
 * @param cur  the current node
 * @param encoding  the document encoding (optional)
 * @param format  should formatting newlines been added
 * @returns the number of bytes written or -1 in case of failure.
 */
int
htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED,
                       xmlNode *cur, const char *encoding, int format) {
    xmlCharEncodingHandlerPtr encoder;
    xmlOutputBufferPtr output;

    xmlInitParser();

    if (htmlFindOutputEncoder(encoding, &encoder) != XML_ERR_OK)
        return(-1);

    output = xmlOutputBufferCreateFile(out, encoder);
    if (output == NULL) {
        xmlCharEncCloseFunc(encoder);
        return(-1);
    }

    htmlNodeDumpInternal(output, cur, NULL, format);

    /* The result of closing the output buffer is the return value. */
    return(xmlOutputBufferClose(output));
}

/**
 * Same as #htmlNodeDumpFileFormat with a NULL `encoding` and
 * `format` set to 1 which is typically undesired. Use of this
 * function is DISCOURAGED in favor of #htmlNodeDumpFileFormat.
 *
 * @param out  the FILE pointer
 * @param doc  the document
 * @param cur  the current node
 */
void
htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) {
    /* The result is dropped, this function can't report errors. */
    (void) htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
}

/**
 * Serialize an HTML node to a memory, also returning the size of
 * the result. It's up to the caller to free the memory.
 *
 * Uses the encoding of the document. If the document has no
 * encoding, ASCII with HTML 4.0 named character entities will
 * be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * On any failure, including a result that doesn't fit into an
 * `int` or a failed memory allocation, `*mem` is set to NULL and
 * `*size` to 0. Nothing is written if `mem` or `size` is NULL.
 *
 * @param cur  the document
 * @param mem  OUT: the memory pointer
 * @param size  OUT: the memory length
 * @param format  should formatting newlines been added
 */
void
htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) {
    xmlOutputBufferPtr buf;
    xmlCharEncodingHandlerPtr handler = NULL;

    xmlInitParser();

    if ((mem == NULL) || (size == NULL))
        return;
    *mem = NULL;
    *size = 0;
    if (cur == NULL)
        return;

    if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
        return;
    buf = xmlAllocOutputBuffer(handler);
    if (buf == NULL) {
        xmlCharEncCloseFunc(handler);
        return;
    }

    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);

    xmlOutputBufferFlush(buf);

    if (!buf->error) {
        /* Prefer the converted data if a conversion buffer exists. */
        xmlBufPtr result = (buf->conv != NULL) ? buf->conv : buf->buffer;
        size_t len = xmlBufUse(result);

        /*
         * The length is reported as int. Refuse results that don't
         * fit instead of returning a truncated or negative size.
         */
        if (len <= INT_MAX) {
            xmlChar *copy = xmlStrndup(xmlBufContent(result), (int) len);

            /* Only publish a size together with a valid pointer. */
            if (copy != NULL) {
                *mem = copy;
                *size = (int) len;
            }
        }
    }

    xmlOutputBufferClose(buf);
}

/**
 * Same as #htmlDocDumpMemoryFormat with `format` set to 1 which
 * is typically undesired. Also see the warnings there. Use of
 * this function is DISCOURAGED in favor of
 * #htmlDocContentDumpFormatOutput.
 *
 * @param cur  the document
 * @param mem  OUT: the memory pointer
 * @param size  OUT: the memory length
 */
void
htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) {
    const int withFormatting = 1;

    htmlDocDumpMemoryFormat(cur, mem, size, withFormatting);
}


/************************************************************************
 *									*
 *		Dumping HTML tree content to an I/O output buffer	*
 *									*
 ************************************************************************/

/**
 * Serialize the HTML document's DTD, if any.
 *
 * Writes `<!DOCTYPE name`, followed by a PUBLIC or SYSTEM identifier
 * if present, and terminates the declaration with a newline. Nothing
 * is written if the document has no internal subset.
 *
 * Ignores `encoding` and uses the encoding of the output buffer.
 *
 * @param buf  the HTML buffer output
 * @param doc  the document
 * @param encoding  the encoding string (unused)
 */
static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                  const char *encoding ATTRIBUTE_UNUSED) {
    xmlDtdPtr dtd = doc->intSubset;
    const xmlChar *publicId, *systemId;

    if (dtd == NULL)
        return;

    publicId = dtd->ExternalID;
    systemId = dtd->SystemID;

    xmlOutputBufferWrite(buf, 10, "<!DOCTYPE ");
    xmlOutputBufferWriteString(buf, (const char *) dtd->name);

    if (publicId != NULL) {
        xmlOutputBufferWrite(buf, 8, " PUBLIC ");
        xmlOutputBufferWriteQuotedString(buf, publicId);
        if (systemId != NULL) {
            xmlOutputBufferWrite(buf, 1, " ");
            xmlOutputBufferWriteQuotedString(buf, systemId);
        }
    } else if ((systemId != NULL) &&
               (xmlStrcmp(systemId, BAD_CAST "about:legacy-compat") != 0)) {
        /* The "about:legacy-compat" system identifier is omitted. */
        xmlOutputBufferWrite(buf, 8, " SYSTEM ");
        xmlOutputBufferWriteQuotedString(buf, systemId);
    }

    xmlOutputBufferWrite(buf, 2, ">\n");
}

/**
 * Serialize the value of a URI attribute.
 *
 * See appendix "B.2.1 Non-ASCII characters in URI attribute
 * values" in the HTML 4.01 spec. This is also recommended
 * by the HTML output method of the XSLT 1.0 spec.
 *
 * Leading whitespace is copied unchanged. After that, bytes up to
 * 0x20 and from 0x7F on are percent-escaped, `"` and `&` are
 * replaced with `&quot;` and `&amp;`, and all other bytes are
 * copied as is.
 *
 * @param buf  the output buffer
 * @param content  the zero-terminated attribute value
 */
static void
htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) {
    static const char hexDigits[] = "0123456789ABCDEF";
    /* Start of the data that was scanned but not written yet. */
    const xmlChar *pending = content;
    const xmlChar *cur = content;

    /* Skip over initial whitespace */
    while (IS_WS_HTML(*cur))
        cur++;
    if (cur > pending) {
        xmlOutputBufferWrite(buf, cur - pending, (char *) pending);
        pending = cur;
    }

    for (;;) {
        char pct[3];
        int ch = *cur;

        /* Collect a run of bytes that need no escaping. */
        while (!((ch <= 0x20) || (ch >= 0x7F) ||
                 (ch == '"') || (ch == '&'))) {
            cur++;
            ch = *cur;
        }

        if (cur > pending)
            xmlOutputBufferWrite(buf, cur - pending, (char *) pending);

        if (ch == 0)
            break;

        if (ch == '"') {
            xmlOutputBufferWrite(buf, 6, "&quot;");
        } else if (ch == '&') {
            xmlOutputBufferWrite(buf, 5, "&amp;");
        } else {
            /* Space, control chars and non-ASCII bytes. */
            pct[0] = '%';
            pct[1] = hexDigits[(ch >> 4) & 0x0F];
            pct[2] = hexDigits[ch & 0x0F];
            xmlOutputBufferWrite(buf, 3, pct);
        }

        cur++;
        pending = cur;
    }
}

/**
 * Serialize an HTML attribute.
 *
 * Writes a space, the optionally prefixed attribute name and,
 * unless the attribute has no children or is a boolean attribute,
 * the quoted value.
 *
 * @param buf  the HTML buffer output
 * @param cur  the attribute pointer
 */
static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) {
    xmlNodePtr part;
    int uriValue = 0;

    xmlOutputBufferWrite(buf, 1, " ");

    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
        xmlOutputBufferWrite(buf, 1, ":");
    }
    xmlOutputBufferWriteString(buf, (const char *)cur->name);

    /*
     * The HTML5 spec requires to always serialize empty attribute
     * values as `=""`. We should probably align with HTML5 at some
     * point.
     */
    if (cur->children == NULL)
        return;
    if (htmlIsBooleanAttr(cur->name))
        return;

    xmlOutputBufferWrite(buf, 2, "=\"");

    /*
     * Special handling of URIs doesn't conform to HTML5 and
     * should probably be removed at some point.
     *
     * It applies to href, action and src on any element and to
     * name on "a" elements, as long as neither the attribute nor
     * its element is in a namespace.
     */
    if ((cur->ns == NULL) && (cur->parent != NULL) &&
        (cur->parent->ns == NULL)) {
        if ((xmlStrcasecmp(cur->name, BAD_CAST "href") == 0) ||
            (xmlStrcasecmp(cur->name, BAD_CAST "action") == 0) ||
            (xmlStrcasecmp(cur->name, BAD_CAST "src") == 0)) {
            uriValue = 1;
        } else if ((xmlStrcasecmp(cur->name, BAD_CAST "name") == 0) &&
                   (xmlStrcasecmp(cur->parent->name, BAD_CAST "a") == 0)) {
            uriValue = 1;
        }
    }

    for (part = cur->children; part != NULL; part = part->next) {
        switch (part->type) {
        case XML_TEXT_NODE:
            if (part->content == NULL)
                break;
            if (uriValue) {
                htmlSerializeUri(buf, part->content);
            } else {
                xmlSerializeText(buf, part->content, SIZE_MAX,
                                 XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
            }
            break;

        case XML_ENTITY_REF_NODE:
            /* TODO: We should probably expand entity refs */
            xmlOutputBufferWrite(buf, 1, "&");
            xmlOutputBufferWriteString(buf, (char *) part->name);
            xmlOutputBufferWrite(buf, 1, ";");
            break;

        default:
            /* Other node types are not serialized. */
            break;
        }
    }

    xmlOutputBufferWrite(buf, 1, "\"");
}

/**
 * Serialize an HTML node to an output buffer.
 *
 * If `encoding` is specified, it is used to create or update meta
 * tags containing the character encoding.
 *
 * The subtree is traversed iteratively: the outer loop serializes
 * the current node and descends into children with `continue`, the
 * inner loop at the bottom advances to the next sibling or climbs
 * back to ancestors, writing their end tags. The function only
 * recurses for element nodes whose parent pointer doesn't match the
 * expected parent.
 *
 * Does nothing if `buf` or `cur` is NULL.
 *
 * @param buf  the HTML buffer output
 * @param cur  the current node
 * @param encoding  the encoding string (optional)
 * @param format  should formatting newlines been added
 */
void
htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur,
                     const char *encoding, int format) {
    /*
     * root: node the dump started at, traversal ends when we get
     *       back to it.
     * parent: expected parent of cur, tracked separately from
     *         cur->parent.
     * metaHead: head element that received a generated meta tag,
     *           reset when its end tag is written.
     */
    xmlNodePtr root, parent, metaHead = NULL;
    xmlAttrPtr attr;
    const htmlElemDesc * info;
    /* Set while inside an element whose dataMode is >= DATA_RAWTEXT. */
    int isRaw = 0;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
	return;
    }

    root = cur;
    parent = cur->parent;
    while (1) {
        switch (cur->type) {
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCUMENT_NODE:
            if (((xmlDocPtr) cur)->intSubset != NULL) {
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
            }
            if (cur->children != NULL) {
                /* Always validate cur->parent when descending. */
                if (cur->parent == parent) {
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
            } else {
                xmlOutputBufferWrite(buf, 1, "\n");
            }
            break;

        case XML_ELEMENT_NODE: {
            /* menc is only valid if isMeta is set. */
            htmlMetaEncoding menc;
            int isMeta = 0;
            /* Set if a meta charset tag must be generated in cur. */
            int addMeta = 0;

            /*
             * Some users like lxml are known to pass nodes with a corrupted
             * tree structure. Fall back to a recursive call to handle this
             * case.
             */
            if ((cur->parent != parent) && (cur->children != NULL)) {
                htmlNodeDumpInternal(buf, cur, encoding, format);
                break;
            }

            /*
             * Get specific HTML info for that node.
             */
            if (cur->ns == NULL)
                info = htmlTagLookup(cur->name);
            else
                info = NULL;

            if (encoding != NULL) {
                isMeta = htmlParseMetaEncoding(cur, &menc);

                /*
                 * Don't add meta tag for "HTML" encoding.
                 *
                 * A meta tag is only generated for a "head" element
                 * whose parent is an "html" element which in turn is
                 * the child of a node without parent, and only once
                 * (metaHead not set yet).
                 */
                if ((xmlStrcasecmp(BAD_CAST encoding,
                                   BAD_CAST "HTML") != 0) &&
                    (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
                    (parent != NULL) &&
                    (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
                    (parent->parent != NULL) &&
                    (parent->parent->parent == NULL) &&
                    (metaHead == NULL)) {
                    xmlNodePtr n;

                    metaHead = cur;
                    addMeta = 1;

                    /*
                     * If any child already carries an encoding
                     * declaration, it will be updated in place and
                     * no new meta tag is needed.
                     */
                    for (n = cur->children; n != NULL; n = n->next) {
                        int unused;

                        if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
                            metaHead = NULL;
                            addMeta = 0;
                            break;
                        }
                    }
                }
            }

            /* Start tag: optional prefix, name, namespaces, attributes. */
            xmlOutputBufferWrite(buf, 1, "<");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWrite(buf, 1, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            if (cur->nsDef)
                xmlNsListDumpOutput(buf, cur->nsDef);
            attr = cur->properties;
            while (attr != NULL) {
                if ((!isMeta) || (attr != menc.attr)) {
                    htmlAttrDumpOutput(buf, attr);
                } else {
                    /*
                     * The attribute holding the encoding declaration:
                     * write the text before the old encoding name,
                     * the new encoding and the text after it.
                     */
                    xmlOutputBufferWrite(buf, 1, " ");
                    xmlOutputBufferWriteString(buf, (char *) attr->name);

                    xmlOutputBufferWrite(buf, 2, "=\"");
                    xmlSerializeText(buf, menc.attrValue, menc.off.start,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                    xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                    xmlSerializeText(buf, menc.attrValue + menc.off.end,
                                     menc.off.size - menc.off.end,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                    xmlOutputBufferWrite(buf, 1, "\"");
                }
                attr = attr->next;
            }

            if ((info != NULL) && (info->empty)) {
                /* Elements marked empty in the tag table get no end tag. */
                xmlOutputBufferWrite(buf, 1, ">");
            } else if (cur->children == NULL) {
                /* No children: write start and end tag back to back. */
                if (addMeta) {
                    xmlOutputBufferWrite(buf, 16, "><meta charset=\"");
                    xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                    xmlOutputBufferWrite(buf, 4, "\"></");
                } else {
                    xmlOutputBufferWrite(buf, 3, "></");
                }
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->ns->prefix);
                    xmlOutputBufferWrite(buf, 1, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWrite(buf, 1, ">");
            } else {
                /*
                 * Element with children: finish the start tag and
                 * descend. The end tag is written by the loop at the
                 * bottom when climbing back up.
                 */
                xmlOutputBufferWrite(buf, 1, ">");
                if ((format) &&
                    ((addMeta) ||
                     ((info != NULL) && (!info->isinline) &&
                      (cur->children->type != HTML_TEXT_NODE) &&
                      (cur->children->type != HTML_ENTITY_REF_NODE) &&
                      (cur->children != cur->last) &&
                      (cur->name != NULL) &&
                      (cur->name[0] != 'p')))) /* p, pre, param */
                    xmlOutputBufferWrite(buf, 1, "\n");
                if (addMeta) {
                    xmlOutputBufferWrite(buf, 15, "<meta charset=\"");
                    xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                    xmlOutputBufferWrite(buf, 2, "\">");
                    if ((format) &&
                        (cur->children->type != HTML_TEXT_NODE) &&
                        (cur->children->type != HTML_ENTITY_REF_NODE))
                        xmlOutputBufferWrite(buf, 1, "\n");
                }

                /* Text children of such elements are written unescaped. */
                if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT))
                    isRaw = 1;

                parent = cur;
                cur = cur->children;
                continue;
            }

            /* Formatting newline after an element without children. */
            if ((format) && (cur->next != NULL) &&
                (info != NULL) && (!info->isinline)) {
                if ((cur->next->type != HTML_TEXT_NODE) &&
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
                    (parent != NULL) &&
                    (parent->name != NULL) &&
                    (parent->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWrite(buf, 1, "\n");
            }

            break;
        }

        case XML_ATTRIBUTE_NODE:
            htmlAttrDumpOutput(buf, (xmlAttrPtr) cur);
            break;

        case HTML_TEXT_NODE:
            if (cur->content == NULL)
                break;
            /* Written verbatim for no-escape text nodes and raw text. */
            if ((cur->name == (const xmlChar *)xmlStringTextNoenc) ||
                (isRaw)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            } else {
                xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML);
            }
            break;

        case HTML_COMMENT_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWrite(buf, 4, "<!--");
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
                xmlOutputBufferWrite(buf, 3, "-->");
            }
            break;

        case HTML_PI_NODE:
            /* Processing instructions are closed with ">" only. */
            if (cur->name != NULL) {
                xmlOutputBufferWrite(buf, 2, "<?");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->content != NULL) {
                    xmlOutputBufferWrite(buf, 1, " ");
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->content);
                }
                xmlOutputBufferWrite(buf, 1, ">");
            }
            break;

        case HTML_ENTITY_REF_NODE:
            xmlOutputBufferWrite(buf, 1, "&");
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWrite(buf, 1, ";");
            break;

        case HTML_PRESERVE_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        default:
            break;
        }

        /*
         * The current node is done. Move to the next sibling, or
         * climb up and close ancestors until one has a next sibling
         * or the root is reached.
         */
        while (1) {
            if (cur == root)
                return;
            if (cur->next != NULL) {
                cur = cur->next;
                break;
            }

            /* Leaving the children of an element ends raw text mode. */
            isRaw = 0;

            cur = parent;
            /* cur->parent was validated when descending. */
            parent = cur->parent;

            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                (cur->type == XML_DOCUMENT_NODE)) {
                xmlOutputBufferWrite(buf, 1, "\n");
            } else {
                /* The tag info is only needed for formatting here. */
                if ((format) && (cur->ns == NULL))
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;

                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->last->type != HTML_TEXT_NODE) &&
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
                    ((cur->children != cur->last) || (cur == metaHead)) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWrite(buf, 1, "\n");

                xmlOutputBufferWrite(buf, 2, "</");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWrite(buf, 1, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWrite(buf, 1, ">");

                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->next != NULL)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWrite(buf, 1, "\n");
                }

                if (cur == metaHead)
                    metaHead = NULL;
            }
        }
    }
}

/**
 * Write the HTML serialization of a node to an output buffer.
 *
 * This is a thin front-end: `buf`, `cur` and `format` are handed to
 * the internal serializer. `doc` and `encoding` are accepted for
 * interface compatibility but are not used.
 *
 * @param buf  the output buffer receiving the HTML
 * @param doc  the document (unused)
 * @param cur  the node to serialize
 * @param encoding  the encoding string (unused)
 * @param format  whether formatting newlines should be added
 */
void
htmlNodeDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED,
                         xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED,
                         int format)
{
    /* The caller-supplied encoding is deliberately not forwarded. */
    htmlNodeDumpInternal(buf, cur, NULL, format);
}

/**
 * Write the HTML serialization of a node to an output buffer with
 * formatting always switched on.
 *
 * Behaves like #htmlNodeDumpFormatOutput called with `format` set
 * to 1, which is typically undesired. Use of this function is
 * DISCOURAGED in favor of #htmlNodeDumpFormatOutput.
 *
 * @param buf  the output buffer receiving the HTML
 * @param doc  the document (unused)
 * @param cur  the node to serialize
 * @param encoding  the encoding string (unused)
 */
void
htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED,
                   xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED)
{
    /* Formatting newlines are unconditionally enabled here. */
    const int format = 1;

    htmlNodeDumpInternal(buf, cur, NULL, format);
}

/**
 * Write the HTML serialization of a whole document to an output
 * buffer.
 *
 * The document is passed to the internal serializer as a node.
 * `encoding` is accepted for interface compatibility but is not used.
 *
 * @param buf  the output buffer receiving the HTML
 * @param cur  the document
 * @param encoding  the encoding string (unused)
 * @param format  whether formatting newlines should be added
 */
void
htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur,
                               const char *encoding ATTRIBUTE_UNUSED,
                               int format)
{
    /* Serialization starts at the document itself, viewed as a node. */
    xmlNodePtr docNode = (xmlNodePtr) cur;

    htmlNodeDumpInternal(buf, docNode, NULL, format);
}

/**
 * Write the HTML serialization of a whole document to an output
 * buffer with formatting always switched on.
 *
 * Behaves like #htmlDocContentDumpFormatOutput called with `format`
 * set to 1, which is typically undesired. Use of this function is
 * DISCOURAGED in favor of #htmlDocContentDumpFormatOutput.
 *
 * @param buf  the output buffer receiving the HTML
 * @param cur  the document
 * @param encoding  the encoding string (unused)
 */
void
htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur,
                         const char *encoding ATTRIBUTE_UNUSED)
{
    /* Formatting newlines are unconditionally enabled here. */
    const int format = 1;
    xmlNodePtr docNode = (xmlNodePtr) cur;

    htmlNodeDumpInternal(buf, docNode, NULL, format);
}

/************************************************************************
 *									*
 *		Saving functions front-ends				*
 *									*
 ************************************************************************/

/**
 * Write an HTML document to an already opened `FILE` stream.
 *
 * The output encoder is chosen from the encoding recorded in the
 * document. If the document has no encoding, ASCII with HTML 4.0
 * named character entities will be used. This is inefficient
 * compared to UTF-8 and might be changed in a future version.
 *
 * "Formatting" is enabled unconditionally, which is typically
 * undesired.
 *
 * Use of this function is DISCOURAGED in favor of
 * #htmlNodeDumpFileFormat.
 *
 * @param f  the FILE*
 * @param cur  the document
 * @returns the number of bytes written or -1 in case of failure.
 */
int
htmlDocDump(FILE *f, xmlDoc *cur) {
    xmlCharEncodingHandlerPtr encoder = NULL;
    xmlOutputBufferPtr out;

    /* Library initialization happens before the arguments are checked. */
    xmlInitParser();

    if ((f == NULL) || (cur == NULL))
        return(-1);

    if (htmlFindOutputEncoder((const char *) cur->encoding,
                              &encoder) != XML_ERR_OK)
        return(-1);

    out = xmlOutputBufferCreateFile(f, encoder);
    if (out == NULL) {
        /* No buffer was created, so the encoder is released here. */
        xmlCharEncCloseFunc(encoder);
        return(-1);
    }

    htmlDocContentDumpOutput(out, cur, NULL);

    /* The result of closing the buffer is the function's return value. */
    return(xmlOutputBufferClose(out));
}

/**
 * Write an HTML document to a file.
 *
 * Equivalent to calling #htmlSaveFileFormat with `encoding` set to
 * NULL and `format` set to 1, which is typically undesired.
 *
 * Use of this function is DISCOURAGED in favor of
 * #htmlSaveFileFormat.
 *
 * @param filename  the filename (or URL)
 * @param cur  the document
 * @returns the value returned by #htmlSaveFileFormat, documented as
 * the number of bytes written or -1 in case of failure.
 */
int
htmlSaveFile(const char *filename, xmlDoc *cur) {
    /* Fixed defaults of this convenience entry point. */
    const char *encoding = NULL;
    const int format = 1;

    return(htmlSaveFileFormat(filename, cur, encoding, format));
}

/**
 * Serialize an HTML document to a file using a given encoding.
 *
 * If `filename` is `"-"`, stdout is used. This is potentially
 * insecure and might be changed in a future version.
 *
 * If encoding is NULL, ASCII with HTML 4.0 named character entities
 * will be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * `encoding` selects the output encoder. It is also forwarded to
 * #htmlDocContentDumpFormatOutput, which ignores that argument.
 * NOTE(review): earlier documentation stated that this function sets
 * or updates meta tags containing the character encoding; confirm
 * whether that still holds for this code path.
 *
 * Every failure is reported as -1: invalid arguments, no usable
 * encoder, or an output buffer that could not be created for
 * `filename`.
 *
 * @param filename  the filename
 * @param cur  the document
 * @param encoding  the document encoding (optional)
 * @param format  whether formatting newlines should be added
 * @returns the number of bytes written or -1 in case of failure.
 */
int
htmlSaveFileFormat(const char *filename, xmlDoc *cur,
	           const char *encoding, int format) {
    xmlOutputBufferPtr buf;
    xmlCharEncodingHandlerPtr handler = NULL;

    if ((cur == NULL) || (filename == NULL))
        return(-1);

    xmlInitParser();

    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
        return(-1);

    /*
     * Create the output buffer that writes to the target file.
     */
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
    if (buf == NULL) {
        /*
         * No buffer was created, so release the encoder here and
         * report the failure with the documented error value.
         */
        xmlCharEncCloseFunc(handler);
        return(-1);
    }

    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);

    /* The result of closing the buffer is the function's return value. */
    return(xmlOutputBufferClose(buf));
}

/**
 * Write an HTML document to a file using a given encoding.
 *
 * Equivalent to calling #htmlSaveFileFormat with `format` set to 1,
 * which is typically undesired. Also see the warnings there. Use of
 * this function is DISCOURAGED in favor of #htmlSaveFileFormat.
 *
 * @param filename  the filename
 * @param cur  the document
 * @param encoding  the document encoding
 * @returns the value returned by #htmlSaveFileFormat, documented as
 * the number of bytes written or -1 in case of failure.
 */
int
htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) {
    /* Formatting newlines are unconditionally enabled here. */
    const int format = 1;

    return(htmlSaveFileFormat(filename, cur, encoding, format));
}

#endif /* LIBXML_OUTPUT_ENABLED */

#endif /* LIBXML_HTML_ENABLED */
kc3-lang/libxml2/HTMLtree.c

Commit

kc3-lang/libxml2 /HTMLtree.c