kmx git

/*
 * icu.c: Example of how to use ICU for character encoding conversion
 *
 * This example shows how to use ICU by installing a custom character
 * encoding converter with xmlCtxtSetCharEncConvImpl, available
 * since libxml2 2.14.
 *
 * This approach makes it possible to use ICU even if libxml2 is
 * compiled without ICU support. It also makes sure that *only* ICU
 * is used. Many Linux distros currently ship libxml2 with support
 * for both ICU and iconv which makes the library's behavior hard to
 * predict.
 *
 * The long-term plan is to make libxml2 only support a single
 * conversion library internally (iconv on POSIX).
 */

#include <stdio.h>
#include <libxml/parser.h>
#include <unicode/ucnv.h>

/* Number of UChar units in the UTF-16 pivot buffer of each context. */
#define ICU_PIVOT_BUF_SIZE 1024

/*
 * State for one conversion direction. ICU converts through UTF-16, so
 * every context carries two converters plus the pivot buffer that sits
 * between them.
 */
typedef struct {
    /* converter between the document encoding and UTF-16 */
    UConverter *uconv;
    /* converter between UTF-8 and UTF-16 */
    UConverter *utf8;
    /* pivot cursors handed to ucnv_convertEx; they point into pivot_buf */
    UChar *pivot_source;
    UChar *pivot_target;
    /* non-zero: convert from the encoding to UTF-8; zero: the reverse */
    int isInput;
    /* intermediate UTF-16 storage */
    UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
} myConvCtxt;

/*
 * Conversion callback used for both directions.
 *
 * vctxt is a myConvCtxt; its isInput flag decides whether data flows
 * from the encoding to UTF-8 or from UTF-8 to the encoding. On entry
 * *inlen and *outlen hold the number of bytes available in `in` and
 * `out`; on return they hold the number of bytes consumed and produced.
 * `flush` is forwarded unchanged to ucnv_convertEx.
 *
 * Returns XML_ENC_ERR_SUCCESS when ICU reports success, otherwise the
 * XML_ENC_ERR_* code that the ICU status maps to. If any pointer
 * argument is NULL, XML_ENC_ERR_INTERNAL is returned and *outlen (when
 * available) is set to 0.
 */
static xmlCharEncError
icuConvert(void *vctxt, unsigned char *out, int *outlen,
           const unsigned char *in, int *inlen, int flush) {
    myConvCtxt *conv = vctxt;
    const char *src = (const char *) in;
    const char *srcLimit;
    char *dst = (char *) out;
    char *dstLimit;
    UConverter *toConv, *fromConv;
    UErrorCode status = U_ZERO_ERROR;

    if (!out || !outlen || !inlen || !in) {
        if (outlen != NULL)
            *outlen = 0;
        return XML_ENC_ERR_INTERNAL;
    }

    /* Pick the converter pair according to the direction. */
    fromConv = conv->isInput ? conv->uconv : conv->utf8;
    toConv   = conv->isInput ? conv->utf8  : conv->uconv;

    srcLimit = src + *inlen;
    dstLimit = dst + *outlen;

    /*
     * The ICU API can consume input, including partial sequences,
     * even if the output buffer would overflow. The remaining input
     * must be processed by calling ucnv_convertEx with a possibly
     * empty input buffer.
     */
    ucnv_convertEx(toConv, fromConv, &dst, dstLimit,
                   &src, srcLimit, conv->pivot_buf,
                   &conv->pivot_source, &conv->pivot_target,
                   conv->pivot_buf + ICU_PIVOT_BUF_SIZE,
                   /* reset */ 0, flush, &status);

    /* Report how far both cursors advanced. */
    *inlen = src - (const char *) in;
    *outlen = dst - (char *) out;

    if (U_SUCCESS(status))
        return XML_ENC_ERR_SUCCESS;

    switch (status) {
        case U_BUFFER_OVERFLOW_ERROR:
            return XML_ENC_ERR_SPACE;

        case U_TRUNCATED_CHAR_FOUND: /* Should only happen with flush */
        case U_INVALID_CHAR_FOUND:
        case U_ILLEGAL_CHAR_FOUND:
        case U_ILLEGAL_ESCAPE_SEQUENCE:
        case U_UNSUPPORTED_ESCAPE_SEQUENCE:
            return XML_ENC_ERR_INPUT;

        case U_MEMORY_ALLOCATION_ERROR:
            return XML_ENC_ERR_MEMORY;

        default:
            return XML_ENC_ERR_INTERNAL;
    }
}

/*
 * Create a conversion context for the encoding `name`.
 *
 * isInput is stored in the context and selects which ICU callback is
 * switched to "stop": the to-Unicode callback for input, the
 * from-Unicode callback for output. On success *out receives the new
 * context and 0 is returned. On failure *out stays NULL, everything
 * allocated here is released, and the ICU status is mapped to
 * XML_ERR_UNSUPPORTED_ENCODING (U_FILE_ACCESS_ERROR), XML_ERR_NO_MEMORY
 * (U_MEMORY_ALLOCATION_ERROR or a failed xmlMalloc) or XML_ERR_SYSTEM.
 */
static xmlParserErrors
icuOpen(const char* name, int isInput, myConvCtxt **out)
{
    UErrorCode err = U_ZERO_ERROR;
    myConvCtxt *ctx;

    *out = NULL;

    ctx = xmlMalloc(sizeof(*ctx));
    if (ctx == NULL)
        return XML_ERR_NO_MEMORY;

    /* Pivot buffer starts out empty: both cursors at its beginning. */
    ctx->pivot_source = ctx->pivot_buf;
    ctx->pivot_target = ctx->pivot_buf;
    ctx->isInput = isInput;

    /* Converter for the requested encoding. */
    ctx->uconv = ucnv_open(name, &err);
    if (U_FAILURE(err))
        goto fail;

    /* Make the converter stop instead of substituting characters. */
    err = U_ZERO_ERROR;
    if (isInput)
        ucnv_setToUCallBack(ctx->uconv, UCNV_TO_U_CALLBACK_STOP,
                            NULL, NULL, NULL, &err);
    else
        ucnv_setFromUCallBack(ctx->uconv, UCNV_FROM_U_CALLBACK_STOP,
                              NULL, NULL, NULL, &err);
    if (U_FAILURE(err))
        goto fail;

    /* Converter for the UTF-8 side. */
    err = U_ZERO_ERROR;
    ctx->utf8 = ucnv_open("UTF-8", &err);
    if (U_FAILURE(err))
        goto fail;

    *out = ctx;
    return 0;

fail:
    if (ctx->uconv)
        ucnv_close(ctx->uconv);
    xmlFree(ctx);

    switch (err) {
        case U_FILE_ACCESS_ERROR:
            return XML_ERR_UNSUPPORTED_ENCODING;
        case U_MEMORY_ALLOCATION_ERROR:
            return XML_ERR_NO_MEMORY;
        default:
            return XML_ERR_SYSTEM;
    }
}

/*
 * Release a context created by icuOpen: close both ICU converters and
 * free the structure. A NULL context is ignored.
 */
static void
icuClose(myConvCtxt *cd)
{
    if (cd != NULL) {
        ucnv_close(cd->uconv);
        ucnv_close(cd->utf8);
        xmlFree(cd);
    }
}

static void
icuConvCtxtDtor(void *vctxt) {
    icuClose(vctxt);
}

/*
 * Converter factory installed with xmlCtxtSetCharEncConvImpl.
 *
 * Depending on which of XML_ENC_INPUT / XML_ENC_OUTPUT is set in
 * `flags`, an ICU context is opened for the encoding `name` in that
 * direction and icuConvert is registered as the matching callback.
 * The contexts are then handed to xmlCharEncNewCustomHandler together
 * with icuConvCtxtDtor, and its return value is passed on. vctxt is
 * not used.
 *
 * If opening a context fails, any context opened so far is closed and
 * the error from icuOpen is returned.
 */
static xmlParserErrors
icuConvImpl(void *vctxt, const char *name, xmlCharEncFlags flags,
            xmlCharEncodingHandler **result) {
    xmlCharEncConvFunc decode = NULL, encode = NULL;
    myConvCtxt *inCtx = NULL;
    myConvCtxt *outCtx = NULL;
    xmlParserErrors rc;

    if (flags & XML_ENC_INPUT) {
        rc = icuOpen(name, 1, &inCtx);
        /* Nothing has been opened yet, so there is nothing to undo. */
        if (rc != 0)
            return rc;
        decode = icuConvert;
    }

    if (flags & XML_ENC_OUTPUT) {
        rc = icuOpen(name, 0, &outCtx);
        if (rc != 0) {
            /* icuClose ignores NULL, so this is safe without input. */
            icuClose(inCtx);
            return rc;
        }
        encode = icuConvert;
    }

    return xmlCharEncNewCustomHandler(name, decode, encode, icuConvCtxtDtor,
                                      inCtx, outCtx, result);
}

/*
 * Parse a tiny IBM-1051 encoded document through the custom ICU
 * converter and check that the text content comes out as UTF-8.
 *
 * Returns 0 if the converted content matches the expected UTF-8 bytes,
 * 1 otherwise (including when the parser context cannot be created or
 * the document cannot be parsed).
 */
int
main(void) {
    xmlParserCtxtPtr ctxt;
    xmlDocPtr doc;
    const char *xml;
    xmlChar *content = NULL;
    int ret = 0;

    /*
     * We use IBM-1051, an alias for HP Roman, as a simple example that
     * ICU supports, but iconv (typically) doesn't.
     *
     * Character code 0xDE is U+00DF Latin Small Letter Sharp S.
     */
    xml = "<doc>\xDE</doc>";

    ctxt = xmlNewParserCtxt();
    if (ctxt == NULL) {
        fprintf(stderr, "failed to create parser context\n");
        return 1;
    }
    xmlCtxtSetCharEncConvImpl(ctxt, icuConvImpl, NULL);
    doc = xmlCtxtReadDoc(ctxt, BAD_CAST xml, NULL, "IBM-1051", 0);
    xmlFreeParserCtxt(ctxt);

    /* Parsing fails if the converter could not be set up. */
    if (doc != NULL)
        content = xmlNodeGetContent((xmlNodePtr) doc);

    /* Passing NULL to %s is undefined behaviour, so only print real text. */
    if (content != NULL)
        printf("content: %s\n", (const char *) content);

    /* A NULL content compares unequal and is reported as a failure. */
    if (!xmlStrEqual(content, BAD_CAST "\xC3\x9F")) {
        fprintf(stderr, "conversion failed\n");
        ret = 1;
    }

    xmlFree(content);
    xmlFreeDoc(doc);

    return ret;
}
kc3-lang/libxml2/example/icu.c

Commit

example/icu.c