kmx git

/* Split source by line breaks, and calculate a simplistic checksum. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>

#include <arraylist.h>
#include <diff_main.h>

#include "diff_internal.h"
#include "diff_debug.h"

/*
 * Fold one more byte into a running atom checksum.
 *
 * The previous hash value is multiplied by 23 and the byte is added;
 * arithmetic is unsigned, so overflow wraps around.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int next = hash * multiplier;

	next += atom_byte;
	return next;
}

/*
 * Atomize file-backed data (read through d->root->f) into one diff atom per
 * text line.
 *
 * A line ends in '\n', '\r' or "\r\n". The terminator is counted in the
 * atom's length but is not fed into its hash. If DIFF_FLAG_IGNORE_WHITESPACE
 * is set in d->diff_flags, bytes for which isspace() is true are left out of
 * the hash as well. Atoms record a file offset only; their .at is NULL.
 *
 * Returns DIFF_RC_OK on success, ENOMEM if an atom could not be added, the
 * errno of a failed seek or read (EIO if errno was left unset), and EIO if
 * the file ends before d->len bytes have been seen.
 */
static int
diff_data_atomize_text_lines_fd(struct diff_data *d)
{
	off_t pos = 0;
	const off_t end = pos + d->len;
	/* Guess the number of atoms as len / 50 and derive a power-of-two
	 * sizing argument for ARRAYLIST_INIT from that guess. */
	unsigned int array_size_estimate = d->len / 50;
	unsigned int pow2 = 1;
	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);

	while (array_size_estimate >>= 1)
		pow2++;

	ARRAYLIST_INIT(d->atoms, 1 << pow2);

	if (fseeko(d->root->f, 0, SEEK_SET) == -1)
		return errno;

	while (pos < end) {
		off_t line_end = pos;
		unsigned int hash = 0;
		unsigned char buf[512];
		size_t r, i;
		struct diff_atom *atom;
		int eol = 0;

		/* Read buffer-sized chunks until a line ending shows up or
		 * the data is exhausted. */
		while (eol == 0 && line_end < end) {
			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
			if (r == 0) {
				/* Either a read error, or the file is shorter
				 * than d->len. Retrying cannot make progress,
				 * so bail out instead of spinning. */
				if (ferror(d->root->f) && errno != 0)
					return errno;
				return EIO;
			}
			i = 0;
			/* Never look at bytes beyond 'end', even if the
			 * buffer happens to hold more. */
			while (eol == 0 && i < r && line_end < end) {
				if (buf[i] != '\r' && buf[i] != '\n') {
					if (!ignore_whitespace
					    || !isspace(buf[i]))
						hash = diff_atom_hash_update(
						    hash, buf[i]);
					line_end++;
				} else
					eol = buf[i];
				i++;
			}
		}

		/* When not at the end of data, the line ending char ('\r' or
		 * '\n') must follow */
		if (line_end < end)
			line_end++;
		/* If that was an '\r', also pull in any following '\n' */
		if (line_end < end && eol == '\r') {
			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
				return errno;
			/* Only the single byte after the '\r' matters. */
			r = fread(buf, sizeof(char), 1, d->root->f);
			if (r == 0 && ferror(d->root->f))
				return errno != 0 ? errno : EIO;
			if (r == 1 && buf[0] == '\n')
				line_end++;
		}

		/* Record the found line as diff atom */
		ARRAYLIST_ADD(atom, d->atoms);
		if (!atom)
			return ENOMEM;

		*atom = (struct diff_atom){
			.root = d,
			.pos = pos,
			.at = NULL,	/* atom data is not memory-mapped */
			.len = line_end - pos,
			.hash = hash,
		};

		/* Starting point for next line: the stream was read ahead by
		 * up to a whole buffer, so reposition it explicitly. */
		pos = line_end;
		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
			return errno;
	}

	return DIFF_RC_OK;
}

/*
 * Atomize in-memory data (d->data, d->len bytes) into one diff atom per text
 * line.
 *
 * A line ends in '\n', '\r' or "\r\n". The terminator is counted in the
 * atom's length but is not fed into its hash. If DIFF_FLAG_IGNORE_WHITESPACE
 * is set in d->diff_flags, bytes for which isspace() is true are left out of
 * the hash as well. Each atom's .at points into d->data and .pos is the
 * matching byte offset.
 *
 * Returns DIFF_RC_OK on success, or ENOMEM if an atom could not be added.
 */
static int
diff_data_atomize_text_lines_mmap(struct diff_data *d)
{
	const uint8_t *pos = d->data;
	const uint8_t *end = pos + d->len;
	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);

	/* Guess the number of atoms as len / 50 and derive a power-of-two
	 * sizing argument for ARRAYLIST_INIT from that guess. */
	unsigned int array_size_estimate = d->len / 50;
	unsigned int pow2 = 1;
	while (array_size_estimate >>= 1)
		pow2++;

	ARRAYLIST_INIT(d->atoms, 1 << pow2);

	while (pos < end) {
		const uint8_t *line_end = pos;
		unsigned int hash = 0;

		/* Hash the line's content up to, but excluding, its
		 * terminator. */
		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
			if (!ignore_whitespace
			    || !isspace(*line_end))
				hash = hash * 23 + *line_end;
			line_end++;
		}

		/* When not at the end of data, the line ending char ('\r' or
		 * '\n') must follow */
		if (line_end < end) {
			/* Remember which terminator it is before stepping
			 * over it; afterwards *line_end is the next byte. */
			const bool ended_in_cr = (*line_end == '\r');

			line_end++;
			/* If that was an '\r', also pull in any following
			 * '\n' so that "\r\n" stays within one atom. */
			if (ended_in_cr && line_end < end && *line_end == '\n')
				line_end++;
		}

		/* Record the found line as diff atom */
		struct diff_atom *atom;
		ARRAYLIST_ADD(atom, d->atoms);
		if (!atom)
			return ENOMEM;

		*atom = (struct diff_atom){
			.root = d,
			.pos = (off_t)(pos - d->data),
			.at = pos,
			.len = line_end - pos,
			.hash = hash,
		};

		/* Starting point for next line: */
		pos = line_end;
	}

	return DIFF_RC_OK;
}

/*
 * Split d into line atoms, choosing the implementation by how the data is
 * available: the in-memory variant when d->data is set, otherwise the
 * variant that reads from the file stream.
 */
static int
diff_data_atomize_text_lines(struct diff_data *d)
{
	if (d->data != NULL)
		return diff_data_atomize_text_lines_mmap(d);

	return diff_data_atomize_text_lines_fd(d);
}

/*
 * Public entry point: atomize d by text lines.
 *
 * func_data is accepted but not read by this function. The return value is
 * whatever diff_data_atomize_text_lines() returns for d.
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;
	rc = diff_data_atomize_text_lines(d);
	return rc;
}
thodg/got/lib/diff_atomize_text.c

Commit

lib/diff_atomize_text.c