Tag
Hash :
b67f3bcb
Author :
Date :
2020-11-21T10:49:25
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
/* Split source by line breaks, and calculate a simplistic checksum. */
/*
* Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include <arraylist.h>
#include <diff_main.h>
#include "diff_internal.h"
#include "diff_debug.h"
/*
 * Fold one more byte into a running line checksum.
 *
 * hash:      checksum accumulated so far (0 for the first byte of a line).
 * atom_byte: next byte of the line.
 *
 * Returns the new checksum: the previous value multiplied by 23 plus the
 * byte value, computed in unsigned arithmetic (so it wraps on overflow).
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int scaled = hash * multiplier;

	return scaled + atom_byte;
}
/*
 * Split the first d->len bytes of the stream d->root->f into lines and
 * append one atom per line to d->atoms. Used when the data is only
 * reachable through the stream (the atoms get .at = NULL).
 *
 * A line ends at "\r", "\n" or "\r\n"; the terminator is counted in the
 * atom's length but never contributes to its hash. If
 * DIFF_FLAG_IGNORE_WHITESPACE is set in d->diff_flags, bytes for which
 * isspace() is true are left out of the hash as well. A final line without
 * terminator still becomes an atom.
 *
 * If any line contains a '\0' byte, DIFF_ATOMIZER_FOUND_BINARY_DATA is set
 * in d->atomizer_flags.
 *
 * Returns DIFF_RC_OK on success, ENOMEM if an atom cannot be added, an
 * errno value if seeking or reading fails, or EIO if the stream ends
 * before d->len bytes were read.
 */
static int
diff_data_atomize_text_lines_fd(struct diff_data *d)
{
	off_t pos = 0;
	const off_t end = pos + d->len;
	unsigned int array_size_estimate = d->len / 50;
	unsigned int pow2 = 1;
	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
	bool embedded_nul = false;

	/*
	 * Guess one atom per 50 bytes of input and hand ARRAYLIST_INIT the
	 * smallest power of two greater than that guess (at least 2).
	 */
	while (array_size_estimate >>= 1)
		pow2++;

	ARRAYLIST_INIT(d->atoms, 1 << pow2);

	if (fseeko(d->root->f, 0, SEEK_SET) == -1)
		return errno;

	while (pos < end) {
		off_t line_end = pos;
		unsigned int hash = 0;
		unsigned char buf[512];
		size_t r, i;
		struct diff_atom *atom;
		int eol = 0;

		/* Scan forward, one buffer at a time, until a line ending
		 * character or the end of the data is reached. */
		while (eol == 0 && line_end < end) {
			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
			if (r == 0) {
				if (ferror(d->root->f))
					return errno ? errno : EIO;
				/* End of file before d->len bytes were seen:
				 * bail out instead of looping forever. */
				return EIO;
			}
			i = 0;
			/* Never look at bytes beyond the declared length,
			 * even if the stream holds more. */
			while (eol == 0 && i < r && line_end < end) {
				if (buf[i] != '\r' && buf[i] != '\n') {
					if (!ignore_whitespace
					    || !isspace(buf[i]))
						hash = diff_atom_hash_update(
						    hash, buf[i]);
					if (buf[i] == '\0')
						embedded_nul = true;
					line_end++;
				} else
					eol = buf[i];
				i++;
			}
		}

		/* When not at the end of data, the line ending char ('\r' or
		 * '\n') must follow */
		if (line_end < end)
			line_end++;
		/* If that was an '\r', also pull in any following '\n'.
		 * The stream position is somewhere past the buffer just
		 * read, so seek back and peek at exactly one byte. */
		if (line_end < end && eol == '\r') {
			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
				return errno;
			r = fread(buf, sizeof(char), 1, d->root->f);
			if (r == 0 && ferror(d->root->f))
				return errno ? errno : EIO;
			if (r == 1 && buf[0] == '\n')
				line_end++;
		}

		/* Record the found line as diff atom */
		ARRAYLIST_ADD(atom, d->atoms);
		if (!atom)
			return ENOMEM;

		*atom = (struct diff_atom){
			.root = d,
			.pos = pos,
			.at = NULL,	/* atom data is not memory-mapped */
			.len = line_end - pos,
			.hash = hash,
		};

		/* Starting point for next line; reposition the stream there
		 * because the reads above may have consumed more. */
		pos = line_end;
		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
			return errno;
	}

	/* Files are considered binary if they contain embedded '\0' bytes. */
	if (embedded_nul)
		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;

	return DIFF_RC_OK;
}
/*
 * Split the in-memory buffer d->data (d->len bytes) into lines and append
 * one atom per line to d->atoms. Each atom's .at points directly into
 * d->data and .pos is its byte offset from the start of the buffer.
 *
 * A line ends at "\r", "\n" or "\r\n"; the terminator is counted in the
 * atom's length but never contributes to its hash. If
 * DIFF_FLAG_IGNORE_WHITESPACE is set in d->diff_flags, bytes for which
 * isspace() is true are left out of the hash as well. A final line without
 * terminator still becomes an atom.
 *
 * If any line contains a '\0' byte, DIFF_ATOMIZER_FOUND_BINARY_DATA is set
 * in d->atomizer_flags.
 *
 * Returns DIFF_RC_OK on success or ENOMEM if an atom cannot be added.
 */
static int
diff_data_atomize_text_lines_mmap(struct diff_data *d)
{
	const uint8_t *pos = d->data;
	const uint8_t *end = pos + d->len;
	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
	bool embedded_nul = false;
	unsigned int array_size_estimate = d->len / 50;
	unsigned int pow2 = 1;

	/*
	 * Guess one atom per 50 bytes of input and hand ARRAYLIST_INIT the
	 * smallest power of two greater than that guess (at least 2).
	 */
	while (array_size_estimate >>= 1)
		pow2++;

	ARRAYLIST_INIT(d->atoms, 1 << pow2);

	while (pos < end) {
		const uint8_t *line_end = pos;
		unsigned int hash = 0;
		struct diff_atom *atom;

		/* Walk to the line ending character or the end of data,
		 * hashing the line's content on the way. */
		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
			if (!ignore_whitespace
			    || !isspace(*line_end))
				hash = hash * 23 + *line_end;
			if (*line_end == '\0')
				embedded_nul = true;
			line_end++;
		}

		/*
		 * Consume the line terminator, if any: an optional '\r'
		 * followed by an optional '\n'. This accepts "\r", "\n" and
		 * "\r\n" as a single terminator. When the scan above stopped
		 * before the end of data, at least one of the two matches,
		 * so every iteration makes progress.
		 */
		if (line_end < end && *line_end == '\r')
			line_end++;
		if (line_end < end && *line_end == '\n')
			line_end++;

		/* Record the found line as diff atom */
		ARRAYLIST_ADD(atom, d->atoms);
		if (!atom)
			return ENOMEM;

		*atom = (struct diff_atom){
			.root = d,
			.pos = (off_t)(pos - d->data),
			.at = pos,
			.len = line_end - pos,
			.hash = hash,
		};

		/* Starting point for next line: */
		pos = line_end;
	}

	/* Files are considered binary if they contain embedded '\0' bytes. */
	if (embedded_nul)
		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;

	return DIFF_RC_OK;
}
/*
 * Atomize d into lines, choosing the implementation by how the data is
 * available: the in-memory variant when d->data is set, otherwise the
 * stream-reading variant. Returns whatever the chosen variant returns.
 */
static int
diff_data_atomize_text_lines(struct diff_data *d)
{
	if (d->data != NULL)
		return diff_data_atomize_text_lines_mmap(d);

	return diff_data_atomize_text_lines_fd(d);
}
/*
 * Externally visible entry point: atomize d into one atom per text line.
 *
 * func_data: not used by this implementation.
 * d:         the data to atomize.
 *
 * Returns the result of diff_data_atomize_text_lines(d).
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;

	rc = diff_data_atomize_text_lines(d);
	return rc;
}