Commit d958e37a48c693a1907bede563bd835935d23499

Russell Belfer 2013-05-17T17:21:45

Fix issues with git_diff_find_similar There are a number of bugs in the rename code that only were obvious when I started testing it against large old repos with more complex patterns. (The code to do that testing is not ready to merge with libgit2, but I do plan to add more thorough tests.) This contains a significant number of changes and also tweaks the public API slightly to make emulating core git easier. Most notably, this separates the GIT_DIFF_FIND_AND_BREAK_REWRITES flag into FIND_REWRITES (which adds a self-similarity score to every modified file) and BREAK_REWRITES (which splits the modified deltas into add/remove pairs in the diff list). When you do a raw output of core git, rewrites show up as M090 or such, not at A and D output, so I wanted to be able to emulate that. Publicly, this also changes the flags to be uint16_t since we don't need values out of that range. Internally, this contains significant changes from a number of small bug fixes (like using the wrong side of the diff to decide if the object could be found in the ODB vs the workdir) to larger issues about which files can and should be compared and how the various edge cases of similarity scores should be treated. Honestly, I don't think this is the last update that will have to be made to this code, but I think this moves us closer to correct behavior and I tried to document the code so it would be easier to follow..

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
diff --git a/include/git2/diff.h b/include/git2/diff.h
index 54966f1..172aa11 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -243,6 +243,19 @@ typedef struct {
  * `NOT_BINARY` flag set to avoid examining file contents if you do not pass
  * in hunk and/or line callbacks to the diff foreach iteration function.  It
  * will just use the git attributes for those files.
+ *
+ * The similarity score is zero unless you call `git_diff_find_similar()`
+ * which does a similarity analysis of files in the diff.  Use that
+ * function to do rename and copy detection, and to split heavily modified
+ * files in add/delete pairs.  After that call, deltas with a status of
+ * GIT_DELTA_RENAMED or GIT_DELTA_COPIED will have a similarity score
+ * between 0 and 100 indicating how similar the old and new sides are.
+ *
+ * If you ask `git_diff_find_similar` to find heavily modified files to
+ * break, but to not *actually* break the records, then GIT_DELTA_MODIFIED
+ * records may have a non-zero similarity score if the self-similarity is
+ * below the split threshold.  To display this value like core Git, invert
+ * the score (a la `printf("M%03d", 100 - delta->similarity)`).
  */
 typedef struct {
 	git_diff_file old_file;
@@ -408,18 +421,26 @@ typedef enum {
 	/** consider unmodified as copy sources? (`--find-copies-harder`) */
 	GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED = (1 << 3),
 
-	/** split large rewrites into delete/add pairs (`--break-rewrites=/M`) */
-	GIT_DIFF_FIND_AND_BREAK_REWRITES = (1 << 4),
+	/** mark large rewrites for split (`--break-rewrites=/M`) */
+	GIT_DIFF_FIND_REWRITES = (1 << 4),
+	/** actually split large rewrites into delete/add pairs */
+	GIT_DIFF_BREAK_REWRITES = (1 << 5),
+	/** mark rewrites for split and break into delete/add pairs */
+	GIT_DIFF_FIND_AND_BREAK_REWRITES =
+		(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES),
+
+	/** consider untracked files as rename/copy targets */
+	GIT_DIFF_FIND_FROM_UNTRACKED = (1 << 6),
 
 	/** turn on all finding features */
-	GIT_DIFF_FIND_ALL = (0x1f),
+	GIT_DIFF_FIND_ALL = (0x0ff),
 
 	/** measure similarity ignoring leading whitespace (default) */
 	GIT_DIFF_FIND_IGNORE_LEADING_WHITESPACE = 0,
 	/** measure similarity ignoring all whitespace */
-	GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 6),
+	GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
 	/** measure similarity including all data */
-	GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 7),
+	GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
 } git_diff_find_t;
 
 /**
@@ -446,7 +467,7 @@ typedef struct {
  * - `copy_threshold` is the same as the -C option with a value
  * - `rename_from_rewrite_threshold` matches the top of the -B option
  * - `break_rewrite_threshold` matches the bottom of the -B option
- * - `target_limit` matches the -l option
+ * - `target_limit` matches the -l option (approximately)
  *
  * The `metric` option allows you to plug in a custom similarity metric.
  * Set it to NULL for the default internal metric which is based on sampling
@@ -461,18 +482,18 @@ typedef struct {
 	unsigned int flags;
 
 	/** Similarity to consider a file renamed (default 50) */
-	unsigned int rename_threshold;
+	uint16_t rename_threshold;
 	/** Similarity of modified to be eligible rename source (default 50) */
-	unsigned int rename_from_rewrite_threshold;
+	uint16_t rename_from_rewrite_threshold;
 	/** Similarity to consider a file a copy (default 50) */
-	unsigned int copy_threshold;
+	uint16_t copy_threshold;
 	/** Similarity to split modify into delete/add pair (default 60) */
-	unsigned int break_rewrite_threshold;
+	uint16_t break_rewrite_threshold;
 
 	/** Maximum similarity sources to examine (a la diff's `-l` option or
 	 *  the `diff.renameLimit` config) (default 200)
 	 */
-	unsigned int target_limit;
+	size_t target_limit;
 
 	/** Pluggable similarity metric; pass NULL to use internal metric */
 	git_diff_similarity_metric *metric;
diff --git a/src/diff_tform.c b/src/diff_tform.c
index 84650a3..33268e4 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -19,11 +19,13 @@ static git_diff_delta *diff_delta__dup(
 
 	memcpy(delta, d, sizeof(git_diff_delta));
 
-	delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
-	if (delta->old_file.path == NULL)
-		goto fail;
+	if (d->old_file.path != NULL) {
+		delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
+		if (delta->old_file.path == NULL)
+			goto fail;
+	}
 
-	if (d->new_file.path != d->old_file.path) {
+	if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
 		delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
 		if (delta->new_file.path == NULL)
 			goto fail;
@@ -259,6 +261,9 @@ static int normalize_find_opts(
 	if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
 		opts->flags |= GIT_DIFF_FIND_COPIES;
 
+	if (opts->flags & GIT_DIFF_BREAK_REWRITES)
+		opts->flags |= GIT_DIFF_FIND_REWRITES;
+
 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
 
 	if (USE_DEFAULT(opts->rename_threshold))
@@ -307,11 +312,33 @@ static int normalize_find_opts(
 	return 0;
 }
 
-static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size)
+static void validate_delta(git_diff_delta *delta)
+{
+	assert(delta);
+	return;
+/*
+	switch (delta->status) {
+	case GIT_DELTA_ADDED:
+	case GIT_DELTA_UNTRACKED:
+	case GIT_DELTA_IGNORED:
+		assert(delta->new_file.path);
+		break;
+	case GIT_DELTA_DELETED:
+		assert(delta->old_file.path);
+		break;
+	default:
+		assert(delta->old_file.path && delta->new_file.path);
+		break;
+	}
+*/
+}
+
+static int apply_splits_and_deletes(
+	git_diff_list *diff, size_t expected_size, bool actually_split)
 {
 	git_vector onto = GIT_VECTOR_INIT;
 	size_t i;
-	git_diff_delta *delta;
+	git_diff_delta *delta, *deleted;
 
 	if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
 		return -1;
@@ -322,14 +349,26 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size)
 			continue;
 
 		if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) {
-			git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool);
-			if (!deleted)
+
+			/* just leave delta flagged with score if not actually splitting */
+			if (!actually_split) {
+				delta->flags = (delta->flags & ~GIT_DIFF_FLAG__TO_SPLIT);
+				if (delta->status != GIT_DELTA_MODIFIED)
+					delta->similarity = 0;
+				continue;
+			}
+
+			delta->similarity = 0;
+
+			/* make new record for DELETED side of split */
+			if (!(deleted = diff_delta__dup(delta, &diff->pool)))
 				goto on_error;
 
 			deleted->status = GIT_DELTA_DELETED;
 			memset(&deleted->new_file, 0, sizeof(deleted->new_file));
 			deleted->new_file.path = deleted->old_file.path;
 			deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+			validate_delta(deleted);
 
 			if (git_vector_insert(&onto, deleted) < 0)
 				goto on_error;
@@ -338,6 +377,7 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size)
 			memset(&delta->old_file, 0, sizeof(delta->old_file));
 			delta->old_file.path = delta->new_file.path;
 			delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+			validate_delta(delta);
 		}
 
 		if (git_vector_insert(&onto, delta) < 0)
@@ -350,7 +390,6 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size)
 			git__free(delta);
 
 	/* swap new delta list into place */
-	git_vector_sort(&onto);
 	git_vector_swap(&diff->deltas, &onto);
 	git_vector_free(&onto);
 
@@ -359,7 +398,6 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size)
 on_error:
 	git_vector_foreach(&onto, i, delta)
 		git__free(delta);
-
 	git_vector_free(&onto);
 
 	return -1;
@@ -379,7 +417,7 @@ static int similarity_calc(
 {
 	int error = 0;
 	git_diff_file *file = similarity_get_file(diff, file_idx);
-	git_iterator_type_t src = (file_idx & 1) ? diff->old_src : diff->new_src;
+	git_iterator_type_t src = (file_idx & 1) ? diff->new_src : diff->old_src;
 
 	if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */
 		git_buf path = GIT_BUF_INIT;
@@ -455,8 +493,8 @@ static int similarity_measure(
 		return -1;
 
 	/* clip score */
-	if (score < 0)
-		score = 0;
+	if (score < 1)
+		score = 1; /* zero means uncomparable, so use 1 for least similar */
 	else if (score > 100)
 		score = 100;
 
@@ -465,36 +503,50 @@ static int similarity_measure(
 
 #define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
 
+typedef struct {
+	uint32_t idx;
+	uint32_t similarity;
+} diff_find_match;
+
 int git_diff_find_similar(
 	git_diff_list *diff,
 	git_diff_find_options *given_opts)
 {
-	size_t i, j, cache_size, *matches;
+	size_t i, j, cache_size;
 	int error = 0, similarity;
 	git_diff_delta *from, *to;
 	git_diff_find_options opts;
-	size_t tried_targets, num_rewrites = 0;
-	void **cache;
+	size_t num_rewrites = 0, num_updates = 0;
+	void **cache; /* cache of similarity metric file signatures */
+	diff_find_match *matches; /* cache of best matches */
 
 	if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
 		return error;
 
 	/* TODO: maybe abort if deltas.length > target_limit ??? */
+	if (!git__is_uint32(diff->deltas.length))
+		return 0;
 
 	cache_size = diff->deltas.length * 2; /* must store b/c length may change */
 	cache = git__calloc(cache_size, sizeof(void *));
 	GITERR_CHECK_ALLOC(cache);
 
-	matches = git__calloc(diff->deltas.length, sizeof(size_t));
+	matches = git__calloc(diff->deltas.length, sizeof(diff_find_match));
 	GITERR_CHECK_ALLOC(matches);
 
-	/* first break MODIFIED records that are too different (if requested) */
+	/* first mark MODIFIED deltas to split if too different (if requested) */
 
-	if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) {
+	if (FLAG_SET(opts, GIT_DIFF_FIND_REWRITES)) {
 		git_vector_foreach(&diff->deltas, i, from) {
 			if (from->status != GIT_DELTA_MODIFIED)
 				continue;
 
+			/* skip things that aren't plain blobs */
+			if (GIT_MODE_TYPE(from->old_file.mode) !=
+				GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
+				continue;
+
+			/* measure similarity from old_file to new_file */
 			similarity = similarity_measure(
 				diff, &opts, cache, 2 * i, 2 * i + 1);
 
@@ -503,7 +555,9 @@ int git_diff_find_similar(
 				goto cleanup;
 			}
 
-			if ((unsigned int)similarity < opts.break_rewrite_threshold) {
+			if (similarity > 0 &&
+				similarity < (int)opts.break_rewrite_threshold) {
+				from->similarity = (uint32_t)similarity;
 				from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
 				num_rewrites++;
 			}
@@ -513,9 +567,12 @@ int git_diff_find_similar(
 	/* next find the most similar delta for each rename / copy candidate */
 
 	git_vector_foreach(&diff->deltas, i, from) {
-		tried_targets = 0;
+		size_t tried_targets = 0;
+
+		matches[i].idx = i;
+		matches[i].similarity = 0;
 
-		/* skip things that aren't blobs */
+		/* skip things that aren't plain blobs */
 		if (GIT_MODE_TYPE(from->old_file.mode) !=
 			GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
 			continue;
@@ -525,7 +582,13 @@ int git_diff_find_similar(
 			!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
 			continue;
 
-		/* skip all but DELETED files unless copy detection is on */
+		/* don't check UNTRACKED files as source unless given option */
+		if ((from->status == GIT_DELTA_UNTRACKED ||
+			 from->status == GIT_DELTA_IGNORED) &&
+			!FLAG_SET(opts, GIT_DIFF_FIND_FROM_UNTRACKED))
+			continue;
+
+		/* only use DELETED (or split MODIFIED) unless copy detection on */
 		if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES) &&
 			from->status != GIT_DELTA_DELETED &&
 			(from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0)
@@ -540,9 +603,11 @@ int git_diff_find_similar(
 				GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
 				continue;
 
+			/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
+			 * targets; maybe include UNTRACKED and IGNORED if requested.
+			 */
 			switch (to->status) {
 			case GIT_DELTA_ADDED:
-			case GIT_DELTA_UNTRACKED:
 			case GIT_DELTA_RENAMED:
 			case GIT_DELTA_COPIED:
 				break;
@@ -550,18 +615,21 @@ int git_diff_find_similar(
 				if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0)
 					continue;
 				break;
+			case GIT_DELTA_UNTRACKED:
+			case GIT_DELTA_IGNORED:
+				if (!FLAG_SET(opts, GIT_DIFF_FIND_FROM_UNTRACKED))
+					continue;
+				break;
 			default:
-				/* only the above status values should be checked */
+				/* all other status values will be skipped */
 				continue;
 			}
 
-			/* cap on maximum files we'll examine (per "from" file) */
+			/* cap on maximum targets we'll examine (per "from" file) */
 			if (++tried_targets > opts.target_limit)
 				break;
 
-			/* calculate similarity and see if this pair beats the
-			 * similarity score of the current best pair.
-			 */
+			/* calculate similarity for this pair and find best match */
 			similarity = similarity_measure(
 				diff, &opts, cache, 2 * i, 2 * j + 1);
 
@@ -570,112 +638,133 @@ int git_diff_find_similar(
 				goto cleanup;
 			}
 
-			if (to->similarity < (unsigned int)similarity) {
-				to->similarity = (unsigned int)similarity;
-				matches[j] = i + 1;
+			if (matches[i].similarity < (uint32_t)similarity) {
+				matches[i].similarity = (uint32_t)similarity;
+				matches[i].idx = j;
 			}
 		}
 	}
 
 	/* next rewrite the diffs with renames / copies */
 
-	git_vector_foreach(&diff->deltas, j, to) {
-		if (!matches[j]) {
-			assert(to->similarity == 0);
+	git_vector_foreach(&diff->deltas, i, from) {
+		if (!matches[i].similarity)
 			continue;
-		}
 
-		i = matches[j] - 1;
-		from = GIT_VECTOR_GET(&diff->deltas, i);
-		assert(from);
-
-		/* four possible outcomes here:
-		 * 1. old DELETED and if over rename threshold,
-		 *    new becomes RENAMED and old goes away
-		 * 2. old SPLIT and if over rename threshold,
-		 *    new becomes RENAMED and old becomes ADDED (clear SPLIT)
-		 * 3. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and
-		 *    old is more similar to new than it is to itself, in which
-		 *    case, new becomes RENAMED and old becomed ADDED
-		 * 4. otherwise if over copy threshold, new becomes COPIED
+		to = GIT_VECTOR_GET(&diff->deltas, matches[i].idx);
+		assert(to);
+
+		similarity = (int)matches[i].similarity;
+
+		/*
+		 * Four possible outcomes here:
 		 */
 
+		/* 1. DELETED "from" with match over rename threshold becomes
+		 *    RENAMED "from" record (and "to" record goes away)
+		 */
 		if (from->status == GIT_DELTA_DELETED) {
-			if (to->similarity < opts.rename_threshold) {
-				to->similarity = 0;
+			if (similarity < (int)opts.rename_threshold)
 				continue;
-			}
 
-			to->status = GIT_DELTA_RENAMED;
-			memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+			to->flags |= GIT_DIFF_FLAG__TO_DELETE;
 
-			from->flags |= GIT_DIFF_FLAG__TO_DELETE;
-			num_rewrites++;
+			from->status = GIT_DELTA_RENAMED;
+			from->similarity = (uint32_t)similarity;
+			memcpy(&from->new_file, &to->new_file, sizeof(to->new_file));
+			validate_delta(from);
 
+			num_rewrites++;
 			continue;
 		}
 
+		/* 2. SPLIT MODIFIED "from" with match over rename threshold becomes
+		 *    ADDED "from" record (with no SPLIT) and RENAMED "to" record
+		 */
 		if (from->status == GIT_DELTA_MODIFIED &&
-			(from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
-		{
-			if (to->similarity < opts.rename_threshold) {
-				to->similarity = 0;
+			(from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) {
+
+			if (similarity < (int)opts.rename_threshold)
 				continue;
-			}
 
 			to->status = GIT_DELTA_RENAMED;
+			to->similarity = (uint32_t)similarity;
 			memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+			validate_delta(to);
 
 			from->status = GIT_DELTA_ADDED;
 			from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
+			from->similarity = 0; /* reset self-similarity */
 			memset(&from->old_file, 0, sizeof(from->old_file));
-			num_rewrites--;
+			from->old_file.path = from->new_file.path;
+			validate_delta(from);
 
+			num_rewrites--;
+			num_updates++;
 			continue;
 		}
 
+		/* 3. MODIFIED "from" with FIND_RENAMES_FROM_REWRITES with similar
+		 *    "to" and self-similarity below rename_from_rewrite_threshold
+		 *    becomes newly ADDED "from" and RENAMED "to".
+		 */
 		if (from->status == GIT_DELTA_MODIFIED &&
 			FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
-			to->similarity > opts.rename_threshold)
+			similarity > (int)opts.rename_threshold)
 		{
-			similarity = similarity_measure(
+			int self_similarity = similarity_measure(
 				diff, &opts, cache, 2 * i, 2 * i + 1);
-
-			if (similarity < 0) {
-				error = similarity;
+			if (self_similarity < 0) {
+				error = self_similarity;
 				goto cleanup;
 			}
 
-			if ((unsigned int)similarity < opts.rename_from_rewrite_threshold) {
+			if (self_similarity < (int)opts.rename_from_rewrite_threshold) {
 				to->status = GIT_DELTA_RENAMED;
+				to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+				to->similarity = (uint32_t)similarity;
 				memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+				validate_delta(to);
 
 				from->status = GIT_DELTA_ADDED;
+				from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
+				from->similarity = 0;
 				memset(&from->old_file, 0, sizeof(from->old_file));
-				from->old_file.path = to->old_file.path;
+				from->old_file.path = from->new_file.path;
 				from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+				validate_delta(from);
 
+				num_updates++;
 				continue;
 			}
 		}
 
-		if (to->similarity < opts.copy_threshold) {
-			to->similarity = 0;
+		/* 4. if "from" -> "to" over copy threshold, "to" becomes COPIED */
+		if (similarity < (int)opts.copy_threshold)
 			continue;
-		}
 
 		/* convert "to" to a COPIED record */
 		to->status = GIT_DELTA_COPIED;
+		to->similarity = (uint32_t)similarity;
 		memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+		validate_delta(to);
+
+		validate_delta(from);
+
+		num_updates++;
 	}
 
 	if (num_rewrites > 0) {
 		assert(num_rewrites < diff->deltas.length);
 
 		error = apply_splits_and_deletes(
-			diff, diff->deltas.length - num_rewrites);
+			diff, diff->deltas.length - num_rewrites,
+			FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
 	}
 
+	if (num_rewrites > 0 || num_updates > 0)
+		git_vector_sort(&diff->deltas);
+
 cleanup:
 	git__free(matches);
 
diff --git a/src/util.h b/src/util.h
index 6f876d0..5ae87ac 100644
--- a/src/util.h
+++ b/src/util.h
@@ -109,6 +109,13 @@ GIT_INLINE(int) git__is_sizet(git_off_t p)
 	return p == (git_off_t)r;
 }
 
+/** @return true if p fits into the range of a uint32_t */
+GIT_INLINE(int) git__is_uint32(size_t p)
+{
+	uint32_t r = (uint32_t)p;
+	return p == (size_t)r;
+}
+
 /* 32-bit cross-platform rotl */
 #ifdef _MSC_VER /* use built-in method in MSVC */
 #	define git__rotl(v, s) (uint32_t)_rotl(v, s)
diff --git a/tests-clar/diff/rename.c b/tests-clar/diff/rename.c
index 1349d40..01f65ab 100644
--- a/tests-clar/diff/rename.c
+++ b/tests-clar/diff/rename.c
@@ -377,7 +377,8 @@ void test_diff_rename__handles_small_files(void)
 	 */
 	cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts));
 
-	opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES | GIT_DIFF_FIND_AND_BREAK_REWRITES;
+	opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES |
+		GIT_DIFF_FIND_AND_BREAK_REWRITES;
 	cl_git_pass(git_diff_find_similar(diff, &opts));
 
 	git_diff_list_free(diff);