Commit 47a899ffed3c71080e10e73eda092a716f1be168

Vicent Martí 2012-03-01T21:19:51

filter: Beautiful refactoring Comments soothe my soul.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
diff --git a/src/blob.c b/src/blob.c
index 2453261..e1f4a7f 100644
--- a/src/blob.c
+++ b/src/blob.c
@@ -115,19 +115,18 @@ static int write_file_filtered(
 	if (error < GIT_SUCCESS)
 		return error;
 
-	error = git_filter__apply(&dest, &source, filters);
+	error = git_filters_apply(&dest, &source, filters);
 
-	if (error < GIT_SUCCESS) {
-		git_buf_free(&source);
-		git_buf_free(&dest);
-		return error;
-	}
+	/* Free the source as soon as possible. This can be big in memory,
+	 * and we don't want to ODB write to choke */
+	git_buf_free(&source);
 
-	error = git_odb_write(oid, odb, dest.ptr, dest.size, GIT_OBJ_BLOB);
+	if (error == GIT_SUCCESS) {
+		/* Write the file to disk if it was properly filtered */
+		error = git_odb_write(oid, odb, dest.ptr, dest.size, GIT_OBJ_BLOB);
+	}
 
-	git_buf_free(&source);
 	git_buf_free(&dest);
-
 	return GIT_SUCCESS;
 }
 
@@ -186,18 +185,25 @@ int git_blob_create_fromfile(git_oid *oid, git_repository *repo, const char *pat
 		error = write_symlink(oid, odb, full_path.ptr, (size_t)size);
 	} else {
 		git_vector write_filters = GIT_VECTOR_INIT;
+		int filter_count;
 
-		if ((error = git_filter__load_for_file(
-			&write_filters, repo, path, GIT_FILTER_TO_ODB)) < GIT_SUCCESS)
-			goto cleanup;
+		/* Load the filters for writing this file to the ODB */
+		filter_count = git_filters_load(&write_filters, repo, path, GIT_FILTER_TO_ODB);
 
-		if (write_filters.length == 0) {
+		if (filter_count < 0) {
+			/* Negative value means there was a critical error */
+			error = filter_count;
+			goto cleanup;
+		} else if (filter_count == 0) {
+			/* No filters need to be applied to the document: we can stream
+			 * directly from disk */
 			error = write_file_stream(oid, odb, full_path.ptr, size);
 		} else {
+			/* We need to apply one or more filters */
 			error = write_file_filtered(oid, odb, full_path.ptr, &write_filters);
 		}
 
-		git_filter__free(&write_filters);
+		git_filters_free(&write_filters);
 
 		/*
 		 * TODO: eventually support streaming filtered files, for files which are bigger
diff --git a/src/crlf.c b/src/crlf.c
index d8dd1c3..feaa687 100644
--- a/src/crlf.c
+++ b/src/crlf.c
@@ -102,18 +102,74 @@ static int crlf_load_attributes(struct crlf_attrs *ca, git_repository *repo, con
 	return error;
 }
 
-static int crlf_apply_to_odb(git_filter *self, git_buf *dest, const git_buf *source)
+static int drop_crlf(git_buf *dest, const git_buf *source)
 {
+	size_t psize = source->size - 1;
 	size_t i = 0;
+
+	/* Initial scan: see if we can reach the end of the document
+	 * without finding a single carriage return */
+	while (i < psize && source->ptr[i] != '\r')
+		i++;
+
+	/* Clean file? Tell the library to skip this filter */
+	if (i == psize)
+		return -1;
+
+	/* Main scan loop. Keep moving forward until we find a carriage
+	 * return, and then copy the whole chunk to the destination
+	 * buffer.
+	 *
+	 * Note that we only scan until `size - 1`, because we cannot drop a
+	 * carriage return if it's the last character in the file (what a weird
+	 * file, anyway)
+	 */
+	while (i < psize) {
+		size_t org = i;
+
+		while (i < psize && source->ptr[i] != '\r')
+			i++;
+
+		if (i > org)
+			git_buf_put(dest, source->ptr + org, i - org);
+
+		/* We found a carriage return. Is the next character a newline?
+		 * If it is, we just keep moving. The newline will be copied
+		 * to the dest in the next chunk.
+		 *
+		 * If it's not a newline, we need to insert the carriage return
+		 * into the dest buffer, because we don't drop lone CRs.
+		 */
+		if (source->ptr[i + 1] != '\n') {
+			git_buf_putc(dest, '\r');
+		}
+		
+		i++;
+	}
+
+	/* Copy the last character in the file */
+	git_buf_putc(dest, source->ptr[psize]);
+	return 0;
+}
+
+static int crlf_apply_to_odb(git_filter *self, git_buf *dest, const git_buf *source)
+{
 	struct crlf_filter *filter = (struct crlf_filter *)self;
 
 	assert(self && dest && source);
 
+	/* Empty file? Nothing to do */
+	if (source->size == 0)
+		return 0;
+
+	/* Heuristics to see if we can skip the conversion.
+	 * Straight from Core Git.
+	 */
 	if (filter->attrs.crlf_action == GIT_CRLF_AUTO ||
 		filter->attrs.crlf_action == GIT_CRLF_GUESS) {
 
 		git_text_stats stats;
-		git_text__stat(&stats, source);
+		git_text_gather_stats(&stats, source);
 
 		/*
 		 * We're currently not going to even try to convert stuff
@@ -126,7 +182,7 @@ static int crlf_apply_to_odb(git_filter *self, git_buf *dest, const git_buf *sou
 		/*
 		 * And add some heuristics for binary vs text, of course...
 		 */
-		if (git_text__is_binary(&stats))
+		if (git_text_is_binary(&stats))
 			return -1;
 
 #if 0
@@ -144,50 +200,42 @@ static int crlf_apply_to_odb(git_filter *self, git_buf *dest, const git_buf *sou
 			return -1;
 	}
 
-	/* TODO: do not copy anything if there isn't a single CR */
-	while (i < source->size) {
-		size_t org = i;
-
-		while (i < source->size && source->ptr[i] != '\r')
-			i++;
-
-		if (i > org)
-			git_buf_put(dest, source->ptr + org, i - org);
-
-		i++;
-
-		if (i >= source->size || source->ptr[i] != '\n') {
-			git_buf_putc(dest, '\r');
-		}
-	}
-
-	return 0;
+	/* Actually drop the carriage returns */
+	return drop_crlf(dest, source);
 }
 
-int git_filter__crlf_to_odb(git_filter **filter_out, git_repository *repo, const char *path)
+int git_filter_add__crlf_to_odb(git_vector *filters, git_repository *repo, const char *path)
 {
-	struct crlf_filter filter;
+	struct crlf_attrs ca;
+	struct crlf_filter *filter;
 	int error;
 
-	filter.f.apply = &crlf_apply_to_odb;
-	filter.f.do_free = NULL;
-
-	if ((error = crlf_load_attributes(&filter.attrs, repo, path)) < 0)
+	/* Load gitattributes for the path */
+	if ((error = crlf_load_attributes(&ca, repo, path)) < 0)
 		return error;
 
-	filter.attrs.crlf_action = crlf_input_action(&filter.attrs);
+	/*
+	 * Use the core Git logic to see if we should perform CRLF for this file
+	 * based on its attributes & the value of `core.auto_crlf`
+	 */
+	ca.crlf_action = crlf_input_action(&ca);
 
-	if (filter.attrs.crlf_action == GIT_CRLF_BINARY)
+	if (ca.crlf_action == GIT_CRLF_BINARY)
 		return 0;
 
-	if (filter.attrs.crlf_action == GIT_CRLF_GUESS && repo->filter_options.auto_crlf == GIT_AUTO_CRLF_FALSE)
+	if (ca.crlf_action == GIT_CRLF_GUESS && repo->filter_options.auto_crlf == GIT_AUTO_CRLF_FALSE)
 		return 0;
 
-	*filter_out = git__malloc(sizeof(struct crlf_filter));
-	if (*filter_out == NULL)
+	/* If we're good, we create a new filter object and push it
+	 * into the filters array */
+	filter = git__malloc(sizeof(struct crlf_filter));
+	if (filter == NULL)
 		return GIT_ENOMEM;
 
-	memcpy(*filter_out, &filter, sizeof(struct crlf_attrs));
-	return 0;
+	filter->f.apply = &crlf_apply_to_odb;
+	filter->f.do_free = NULL;
+	memcpy(&filter->attrs, &ca, sizeof(struct crlf_attrs));
+
+	return git_vector_insert(filters, filter);
 }
 
diff --git a/src/filter.c b/src/filter.c
index f517512..92b3566 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -13,7 +13,7 @@
 #include "git2/config.h"
 
 /* Fresh from Core Git. I wonder what we could use this for... */
-void git_text__stat(git_text_stats *stats, const git_buf *text)
+void git_text_gather_stats(git_text_stats *stats, const git_buf *text)
 {
 	size_t i;
 
@@ -65,7 +65,7 @@ void git_text__stat(git_text_stats *stats, const git_buf *text)
 /*
  * Fresh from Core Git
  */
-int git_text__is_binary(git_text_stats *stats)
+int git_text_is_binary(git_text_stats *stats)
 {
 	if (stats->nul)
 		return 1;
@@ -84,32 +84,74 @@ int git_text__is_binary(git_text_stats *stats)
 	return 0;
 }
 
-int git_filter__load_for_file(git_vector *filters, git_repository *repo, const char *path, int mode)
+static int load_repository_settings(git_repository *repo)
+{
+	static git_cvar_map map_eol[] = {
+		{GIT_CVAR_FALSE, NULL, GIT_EOL_UNSET},
+		{GIT_CVAR_STRING, "lf", GIT_EOL_LF},
+		{GIT_CVAR_STRING, "crlf", GIT_EOL_CRLF},
+		{GIT_CVAR_STRING, "native", GIT_EOL_NATIVE}
+	};
+
+	static git_cvar_map map_crlf[] = {
+		{GIT_CVAR_FALSE, NULL, GIT_AUTO_CRLF_FALSE},
+		{GIT_CVAR_TRUE, NULL, GIT_AUTO_CRLF_TRUE},
+		{GIT_CVAR_STRING, "input", GIT_AUTO_CRLF_INPUT}
+	};
+
+	git_config *config;
+	int error;
+
+	if (repo->filter_options.loaded)
+		return GIT_SUCCESS;
+
+	repo->filter_options.eol = GIT_EOL_DEFAULT;
+	repo->filter_options.auto_crlf = GIT_AUTO_CRLF_DEFAULT;
+
+	error = git_repository_config__weakptr(&config, repo);
+	if (error < GIT_SUCCESS)
+		return error;
+
+	error = git_config_get_mapped(
+		config, "core.eol", map_eol, ARRAY_SIZE(map_eol), &repo->filter_options.eol);
+
+	if (error < GIT_SUCCESS && error != GIT_ENOTFOUND)
+		return error;
+
+	error = git_config_get_mapped(
+		config, "core.auto_crlf", map_crlf, ARRAY_SIZE(map_crlf), &repo->filter_options.auto_crlf);
+
+	if (error < GIT_SUCCESS && error != GIT_ENOTFOUND)
+		return error;
+
+	repo->filter_options.loaded = 1;
+	return 0;
+}
+
+int git_filters_load(git_vector *filters, git_repository *repo, const char *path, int mode)
 {
 	int error;
-	git_filter *crlf_filter = NULL;
 
-	error = git_filter__load_settings(repo);
+	/* Make sure that the relevant settings from `gitconfig` have been
+	 * cached on the repository struct to speed things up */
+	error = load_repository_settings(repo);
 	if (error < GIT_SUCCESS)
 		return error;
 
 	if (mode == GIT_FILTER_TO_ODB) {
-		error = git_filter__crlf_to_odb(&crlf_filter, repo, path);
+		/* Load the CRLF cleanup filter when writing to the ODB */
+		error = git_filter_add__crlf_to_odb(filters, repo, path);
 		if (error < GIT_SUCCESS)
 			return error;
-
-		if (crlf_filter != NULL)
-			git_vector_insert(filters, crlf_filter);
-
 	} else {
 		return git__throw(GIT_ENOTIMPLEMENTED,
 			"Worktree filters are not implemented yet");
 	}
 
-	return 0;
+	return (int)filters->length;
 }
 
-void git_filter__free(git_vector *filters)
+void git_filters_free(git_vector *filters)
 {
 	size_t i;
 	git_filter *filter;
@@ -124,7 +166,7 @@ void git_filter__free(git_vector *filters)
 	git_vector_free(filters);
 }
 
-int git_filter__apply(git_buf *dest, git_buf *source, git_vector *filters)
+int git_filters_apply(git_buf *dest, git_buf *source, git_vector *filters)
 {
 	unsigned int src, dst, i;
 	git_buf *dbuffer[2];
@@ -134,6 +176,11 @@ int git_filter__apply(git_buf *dest, git_buf *source, git_vector *filters)
 
 	src = 0;
 
+	if (source->size == 0) {
+		git_buf_clear(dest);
+		return GIT_SUCCESS;
+	}
+
 	/* Pre-grow the destination buffer to more or less the size
 	 * we expect it to have */
 	if (git_buf_grow(dest, source->size) < 0)
@@ -167,46 +214,3 @@ int git_filter__apply(git_buf *dest, git_buf *source, git_vector *filters)
 	return GIT_SUCCESS;
 }
 
-int git_filter__load_settings(git_repository *repo)
-{
-	static git_cvar_map map_eol[] = {
-		{GIT_CVAR_FALSE, NULL, GIT_EOL_UNSET},
-		{GIT_CVAR_STRING, "lf", GIT_EOL_LF},
-		{GIT_CVAR_STRING, "crlf", GIT_EOL_CRLF},
-		{GIT_CVAR_STRING, "native", GIT_EOL_NATIVE}
-	};
-
-	static git_cvar_map map_crlf[] = {
-		{GIT_CVAR_FALSE, NULL, GIT_AUTO_CRLF_FALSE},
-		{GIT_CVAR_TRUE, NULL, GIT_AUTO_CRLF_TRUE},
-		{GIT_CVAR_STRING, "input", GIT_AUTO_CRLF_INPUT}
-	};
-
-	git_config *config;
-	int error;
-
-	if (repo->filter_options.loaded)
-		return GIT_SUCCESS;
-
-	repo->filter_options.eol = GIT_EOL_DEFAULT;
-	repo->filter_options.auto_crlf = GIT_AUTO_CRLF_DEFAULT;
-
-	error = git_repository_config__weakptr(&config, repo);
-	if (error < GIT_SUCCESS)
-		return error;
-
-	error = git_config_get_mapped(
-		config, "core.eol", map_eol, ARRAY_SIZE(map_eol), &repo->filter_options.eol);
-
-	if (error < GIT_SUCCESS && error != GIT_ENOTFOUND)
-		return error;
-
-	error = git_config_get_mapped(
-		config, "core.auto_crlf", map_crlf, ARRAY_SIZE(map_crlf), &repo->filter_options.auto_crlf);
-
-	if (error < GIT_SUCCESS && error != GIT_ENOTFOUND)
-		return error;
-
-	repo->filter_options.loaded = 1;
-	return 0;
-}
diff --git a/src/filter.h b/src/filter.h
index 0cf92bd..601be18 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -60,19 +60,81 @@ typedef struct {
 	unsigned int printable, nonprintable;
 } git_text_stats;
 
-extern int git_filter__load_settings(git_repository *repo);
-extern int git_filter__load_for_file(git_vector *filters, git_repository *repo, const char *full_path, int mode);
-extern void git_filter__free(git_vector *filters);
-extern int git_filter__apply(git_buf *dest, git_buf *source, git_vector *filters);
+/*
+ * FILTER API
+ */
+
+/*
+ * For any given path in the working directory, fill the `filters`
+ * array with the relevant filters that need to be applied.
+ *
+ * Mode is either `GIT_FILTER_TO_WORKTREE` if you need to load the
+ * filters that will be used when checking out a file to the working
+ * directory, or `GIT_FILTER_TO_ODB` for the filters used when writing
+ * a file to the ODB.
+ *
+ * @param filters Vector where to store all the loaded filters
+ * @param repo Repository object that contains `path`
+ * @param path Relative path of the file to be filtered
+ * @param mode Filtering direction (WT->ODB or ODB->WT)
+ * @return the number of filters loaded for the file (0 if the file
+ *	doesn't need filtering), or a negative error code
+ */
+extern int git_filters_load(git_vector *filters, git_repository *repo, const char *path, int mode);
+
+/*
+ * Apply one or more filters to a file.
+ *
+ * The file must have been loaded as a `git_buf` object. Both the `source`
+ * and `dest` buffers are owned by the caller and must be freed once
+ * they are no longer needed.
+ *
+ * NOTE: Because of the double-buffering schema, the `source` buffer that contains
+ * the original file may be tampered once the filtering is complete. Regardless, 
+ * the `dest` buffer will always contain the final result of the filtering
+ *
+ * @param dest Buffer to store the result of the filtering
+ * @param source Buffer containing the document to filter
+ * @param filters A non-empty vector of filters as supplied by `git_filters_load`
+ * @return GIT_SUCCESS on success, an error code otherwise
+ */
+extern int git_filters_apply(git_buf *dest, git_buf *source, git_vector *filters);
+
+/*
+ * Free the `filters` array generated by `git_filters_load`.
+ *
+ * Note that this frees both the array and its contents. The array will
+ * be clean/reusable after this call.
+ *
+ * @param filters A filters array as supplied by `git_filters_load`
+ */
+extern void git_filters_free(git_vector *filters);
+
+/*
+ * Available filters
+ */
 
-/* Gather stats for a piece of text */
-extern void git_text__stat(git_text_stats *stats, const git_buf *text);
+/* Strip CRLF, from Worktree to ODB */
+extern int git_filter_add__crlf_to_odb(git_vector *filters, git_repository *repo, const char *path);
 
-/* Heuristics on a set of text stats to check whether it's binary
- * text or not */
-extern int git_text__is_binary(git_text_stats *stats);
 
-/* Available filters */
-extern int git_filter__crlf_to_odb(git_filter **filter_out, git_repository *repo, const char *path);
+/*
+ * PLAINTEXT API
+ */
+
+/*
+ * Gather stats for a piece of text
+ *
+ * Fill the `stats` structure with information on the number of
+ * unreadable characters, carriage returns, etc, so it can be
+ * used in heuristics.
+ */
+extern void git_text_gather_stats(git_text_stats *stats, const git_buf *text);
+
+/*
+ * Process `git_text_stats` data generated by `git_text_stat` to see
+ * if it qualifies as a binary file
+ */
+extern int git_text_is_binary(git_text_stats *stats);
 
 #endif