iterator: saner pathlist matching for idx iterator

Some nicer refactoring for index iteration walks. The index iterator doesn't binary search through the pathlist space: since the index lacks directory entries, it would have to binary search for each index entry and for all of its parents (e.g., when presented with an index entry of `foo/bar/file.c`, you would have to look in the pathlist for `foo/bar/file.c`, `foo/bar` and `foo`). Since the index entries and the pathlist are both nicely sorted, we instead walk the index entries in lockstep with the pathlist, like we do for other iteration/diff/merge walks.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
diff --git a/src/iterator.c b/src/iterator.c
index 9bf56b6..e35c8dc 100644
--- a/src/iterator.c
+++ b/src/iterator.c
@@ -75,7 +75,8 @@ static int iterator_pathlist__init(git_iterator *iter, git_strarray *pathspec)
{
size_t i;
- if (git_vector_init(&iter->pathlist, pathspec->count, iter->strcomp) < 0)
+ if (git_vector_init(&iter->pathlist, pathspec->count,
+ (git_vector_cmp)iter->strcomp) < 0)
return -1;
for (i = 0; i < pathspec->count; i++) {
@@ -98,7 +99,8 @@ static iterator_pathlist__match_t iterator_pathlist__match(
size_t idx;
int error;
- error = git_vector_bsearch2(&idx, &iter->pathlist, iter->strcomp, path);
+ error = git_vector_bsearch2(&idx, &iter->pathlist,
+ (git_vector_cmp)iter->strcomp, path);
if (error == 0)
return ITERATOR_PATHLIST_MATCH;
@@ -116,10 +118,7 @@ static iterator_pathlist__match_t iterator_pathlist__match(
/* is this a literal directory entry (eg `foo/`) or a file beneath */
if (p[path_len] == '/') {
- while (p[path_len] == '/')
- path_len++;
-
- return (p[path_len] == '\0') ?
+ return (p[path_len+1] == '\0') ?
ITERATOR_PATHLIST_MATCH_DIRECTORY :
ITERATOR_PATHLIST_MATCH_CHILD;
}
@@ -133,10 +132,68 @@ static iterator_pathlist__match_t iterator_pathlist__match(
return ITERATOR_PATHLIST_NONE;
}
+static void iterator_pathlist_walk__reset(git_iterator *iter)
+{
+ iter->pathlist_walk_idx = 0; /* the next pathlist walk scans from the first pathlist entry again */
+}
+
+/* Walker for the index iterator: reports whether `path` is selected by the
+ * sorted pathlist, walking the pathlist alongside the sorted index entries.
+ *
+ * `iter->pathlist_walk_idx` stores the starting position for subsequent
+ * calls. It is advanced along with the index iterator, past entries that
+ * can no longer match, so paths are expected to arrive in sorted order.
+ * There is a special case for directories in the pathlist that are
+ * specified without a trailing '/' (eg, `foo`): we do not advance over
+ * these entries until we're certain that the index iterator will not ask
+ * us for a file beneath that directory (eg, `foo/bar`).
+ *
+ * Returns true when `path` equals a pathlist entry or lies beneath one,
+ * false otherwise.
+ *
+ * NOTE(review): the start position is bumped with `++` instead of being
+ * derived from the loop index `i`. If an earlier prefix entry was kept on
+ * purpose (the `path[p_len] > '/'` test failed) and a later entry in the
+ * same call sorts before `path`, the bump appears to move the start past
+ * the kept entry rather than past the later one. Eg, pathlist `foo`,
+ * `foo-b` queried with `foo-c` and then `foo/bar` looks like it would
+ * miss `foo/bar` -- confirm with a test.
+ */
+static bool iterator_pathlist_walk__contains(git_iterator *iter, const char *path)
+{
+ size_t i;
+ char *p;
+ size_t p_len;
+ int cmp;
+
+ for (i = iter->pathlist_walk_idx; i < iter->pathlist.length; i++) {
+ p = iter->pathlist.contents[i];
+ p_len = strlen(p);
+
+ /* compare only the first p_len bytes: zero means the pathlist entry is a prefix of this path */
+ cmp = iter->strncomp(p, path, p_len);
+
+ /* an empty entry, or one that sorts before the given path: move the start position forward and try the next */
+ if (!p_len || cmp < 0)
+ iter->pathlist_walk_idx++;
+
+ /* this pathlist entry sorts after the given path, so there is no match; the start position is left alone */
+ else if (cmp > 0)
+ return false;
+
+ /* match! the entry is a prefix of the path and one of these holds: an
+ * exact match (`foo` vs `foo`), the path is a child of an explicit
+ * directory in the pathlist (`foo/` vs `foo/bar`), or the path is a
+ * child of an entry in the pathlist (`foo` vs `foo/bar`). p_len is nonzero here.
+ */
+ else if (path[p_len] == '\0' || p[p_len - 1] == '/' || path[p_len] == '/')
+ return true;
+
+ /* the entry is a prefix of the path but not a match. only advance the
+ * start index for future callers if we know that we will not see a
+ * child of this entry. eg, a pathlist entry `foo` is a prefix for both
+ * `foo.txt` and `foo/bar`. don't advance the start pathlist index when
+ * we see `foo.txt` ('.' is below '/') or we would miss a subsequent
+ * inspection of `foo/bar`. only advance when the next byte is above
+ * '/', ie when there are no more potential children.
+ */
+ else if (path[p_len] > '/')
+ iter->pathlist_walk_idx++;
+ }
+
+ return false;
+}
+
static void iterator_pathlist__update_ignore_case(git_iterator *iter)
{
- git_vector_set_cmp(&iter->pathlist, iter->strcomp);
+ git_vector_set_cmp(&iter->pathlist, (git_vector_cmp)iter->strcomp);
git_vector_sort(&iter->pathlist);
+
+ iter->pathlist_walk_idx = 0;
}
@@ -583,13 +640,13 @@ static int tree_iterator__current_internal(
return 0;
}
-int tree_iterator__advance(
+static int tree_iterator__advance(
const git_index_entry **out, git_iterator *self);
static int tree_iterator__current(
const git_index_entry **out, git_iterator *self)
{
- git_index_entry *entry = NULL;
+ const git_index_entry *entry = NULL;
iterator_pathlist__match_t m;
int error;
@@ -797,9 +854,7 @@ static const git_index_entry *index_iterator__advance_over_unwanted(
index_iterator *ii)
{
const git_index_entry *ie = index_iterator__index_entry(ii);
- const char *p;
- size_t p_len;
- int cmp;
+ bool match;
while (ie) {
if (!iterator__include_conflicts(ii) &&
@@ -810,53 +865,17 @@ static const git_index_entry *index_iterator__advance_over_unwanted(
}
/* if we have a pathlist, this entry's path must be in it to be
- * returned. otherwise, advance the pathlist entry or the iterator
- * until we find the next path that we want to return.
+ * returned. walk the pathlist in unison with the index to
+ * compare paths.
*/
if (ii->base.pathlist.length) {
+ match = iterator_pathlist_walk__contains(&ii->base, ie->path);
- if (ii->pathlist_idx >= ii->base.pathlist.length) {
- ii->current = SIZE_MAX;
- ie = NULL;
- break;
- }
-
- p = git_vector_get(&ii->base.pathlist, ii->pathlist_idx);
-
- /* trim trailing slashes that indicate an exact directory match */
- p_len = strlen(p);
-
- while (p_len && p[p_len-1] == '/')
- p_len--;
-
- cmp = ii->base.strncomp(ie->path, p, p_len);
-
- /* we've matched the prefix - if the pathlist entry is equal to
- * this entry, or if the pathlist entry is a folder (eg `foo/`)
- * and this entry was beneath that, then continue. otherwise,
- * sort the index entry path against the pathlist entry.
- */
- if (cmp == 0) {
- if (ie->path[p_len] == 0)
- ;
- else if (ie->path[p_len] == '/')
- ;
- else if (ie->path[p_len] < '/')
- cmp = -1;
- else if (ie->path[p_len] > '/')
- cmp = 1;
- }
-
- if (cmp < 0) {
+ if (!match) {
ii->current++;
ie = index_iterator__index_entry(ii);
continue;
}
-
- if (cmp > 0) {
- ii->pathlist_idx++;
- continue;
- }
}
break;
@@ -1006,7 +1025,8 @@ static int index_iterator__reset(
return -1;
ii->current = 0;
- ii->pathlist_idx = 0;
+
+ iterator_pathlist_walk__reset(self);
/* if we're given a start prefix, find it; if we're given a pathlist, find
* the first of those. start at the later of the two.
@@ -1193,7 +1213,7 @@ static void fs_iterator__seek_frame_start(
ff->index = 0;
}
-static int dirload_with_stat(git_vector *contents, size_t *filtered, fs_iterator *fi)
+static int dirload_with_stat(git_vector *contents, fs_iterator *fi)
{
git_path_diriter diriter = GIT_PATH_DIRITER_INIT;
const char *path;
@@ -1204,8 +1224,6 @@ static int dirload_with_stat(git_vector *contents, size_t *filtered, fs_iterator
iterator_pathlist__match_t pathlist_match = ITERATOR_PATHLIST_MATCH;
int error;
- *filtered = 0;
-
/* Any error here is equivalent to the dir not existing, skip over it */
if ((error = git_path_diriter_init(
&diriter, fi->path.ptr, fi->dirload_flags)) < 0) {
@@ -1241,11 +1259,8 @@ static int dirload_with_stat(git_vector *contents, size_t *filtered, fs_iterator
if (fi->base.pathlist.length &&
fi->pathlist_match != ITERATOR_PATHLIST_MATCH &&
fi->pathlist_match != ITERATOR_PATHLIST_MATCH_DIRECTORY &&
- !(pathlist_match = iterator_pathlist__match(&fi->base, path, path_len))) {
-
- *filtered++;
+ !(pathlist_match = iterator_pathlist__match(&fi->base, path, path_len)))
continue;
- }
/* Make sure to append two bytes, one for the path's null
* termination, one for a possible trailing '/' for folders.
@@ -1313,7 +1328,6 @@ static int fs_iterator__expand_dir(fs_iterator *fi)
{
int error;
fs_iterator_frame *ff;
- size_t filtered = 0;
if (fi->depth > FS_MAX_DEPTH) {
giterr_set(GITERR_REPOSITORY,
@@ -1324,7 +1338,7 @@ static int fs_iterator__expand_dir(fs_iterator *fi)
ff = fs_iterator__alloc_frame(fi);
GITERR_CHECK_ALLOC(ff);
- error = dirload_with_stat(&ff->entries, &filtered, fi);
+ error = dirload_with_stat(&ff->entries, fi);
if (error < 0) {
git_error_state last_error = { 0 };
diff --git a/src/iterator.h b/src/iterator.h
index d2d61fb..59f87e9 100644
--- a/src/iterator.h
+++ b/src/iterator.h
@@ -70,6 +70,7 @@ struct git_iterator {
char *start;
char *end;
git_vector pathlist;
+ size_t pathlist_walk_idx;
int (*strcomp)(const char *a, const char *b);
int (*strncomp)(const char *a, const char *b, size_t n);
int (*prefixcomp)(const char *str, const char *prefix);
diff --git a/tests/diff/workdir.c b/tests/diff/workdir.c
index 336f959..e877691 100644
--- a/tests/diff/workdir.c
+++ b/tests/diff/workdir.c
@@ -581,30 +581,6 @@ void test_diff_workdir__to_index_with_pathlist_disabling_fnmatch(void)
git_diff_free(diff);
- /* ensure that multiple trailing slashes are ignored */
- pathspec = "subdir//////";
-
- cl_git_pass(git_diff_index_to_workdir(&diff, g_repo, NULL, &opts));
-
- for (use_iterator = 0; use_iterator <= 1; use_iterator++) {
- memset(&exp, 0, sizeof(exp));
-
- if (use_iterator)
- cl_git_pass(diff_foreach_via_iterator(
- diff, diff_file_cb, NULL, NULL, NULL, &exp));
- else
- cl_git_pass(git_diff_foreach(diff, diff_file_cb, NULL, NULL, NULL, &exp));
-
- cl_assert_equal_i(3, exp.files);
- cl_assert_equal_i(0, exp.file_status[GIT_DELTA_ADDED]);
- cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
- cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]);
- cl_assert_equal_i(0, exp.file_status[GIT_DELTA_IGNORED]);
- cl_assert_equal_i(1, exp.file_status[GIT_DELTA_UNTRACKED]);
- }
-
- git_diff_free(diff);
-
/* ensure that fnmatching is completely disabled */
pathspec = "subdir/*";
diff --git a/tests/repo/iterator.c b/tests/repo/iterator.c
index cb9d4cd..8eeb7d3 100644
--- a/tests/repo/iterator.c
+++ b/tests/repo/iterator.c
@@ -1162,6 +1162,76 @@ void test_repo_iterator__indexfilelist_2(void)
git_vector_free(&filelist);
}
+void test_repo_iterator__indexfilelist_3(void)
+{
+ git_iterator *i;
+ git_iterator_options i_opts = GIT_ITERATOR_OPTIONS_INIT;
+ git_index *index;
+ git_vector filelist = GIT_VECTOR_INIT;
+
+ g_repo = cl_git_sandbox_init("icase");
+
+ cl_git_pass(git_repository_index(&index, g_repo));
+
+ cl_git_pass(git_vector_init(&filelist, 100, &git__strcmp_cb));
+ cl_git_pass(git_vector_insert(&filelist, "0"));
+ cl_git_pass(git_vector_insert(&filelist, "c"));
+ cl_git_pass(git_vector_insert(&filelist, "D"));
+ cl_git_pass(git_vector_insert(&filelist, "e"));
+ cl_git_pass(git_vector_insert(&filelist, "k/")); /* pathlist entry written with a trailing '/' */
+ cl_git_pass(git_vector_insert(&filelist, "k.a"));
+ cl_git_pass(git_vector_insert(&filelist, "k.b"));
+ cl_git_pass(git_vector_insert(&filelist, "kZZZZZZZ"));
+
+ i_opts.pathlist.strings = (char **)filelist.contents; /* the iterator's pathlist is taken straight from the vector's storage */
+ i_opts.pathlist.count = filelist.length;
+
+ i_opts.start = "b"; /* the pathlist is combined with a start/end range */
+ i_opts.end = "k/D";
+
+ cl_git_pass(git_iterator_for_index(&i, index, &i_opts));
+ expect_iterator_items(i, 8, NULL, 8, NULL); /* NOTE(review): presumably 8 entries are expected for both counts the helper checks -- confirm against expect_iterator_items */
+ git_iterator_free(i);
+
+ git_index_free(index);
+ git_vector_free(&filelist);
+}
+
+void test_repo_iterator__indexfilelist_4(void)
+{
+ git_iterator *i;
+ git_iterator_options i_opts = GIT_ITERATOR_OPTIONS_INIT;
+ git_index *index;
+ git_vector filelist = GIT_VECTOR_INIT;
+
+ g_repo = cl_git_sandbox_init("icase");
+
+ cl_git_pass(git_repository_index(&index, g_repo));
+
+ cl_git_pass(git_vector_init(&filelist, 100, &git__strcmp_cb));
+ cl_git_pass(git_vector_insert(&filelist, "0"));
+ cl_git_pass(git_vector_insert(&filelist, "c"));
+ cl_git_pass(git_vector_insert(&filelist, "D"));
+ cl_git_pass(git_vector_insert(&filelist, "e"));
+ cl_git_pass(git_vector_insert(&filelist, "k")); /* written without a trailing '/'; test_repo_iterator__indexfilelist_3 uses "k/" here */
+ cl_git_pass(git_vector_insert(&filelist, "k.a"));
+ cl_git_pass(git_vector_insert(&filelist, "k.b"));
+ cl_git_pass(git_vector_insert(&filelist, "kZZZZZZZ"));
+
+ i_opts.pathlist.strings = (char **)filelist.contents; /* the iterator's pathlist is taken straight from the vector's storage */
+ i_opts.pathlist.count = filelist.length;
+
+ i_opts.start = "b"; /* the pathlist is combined with a start/end range */
+ i_opts.end = "k/D";
+
+ cl_git_pass(git_iterator_for_index(&i, index, &i_opts));
+ expect_iterator_items(i, 8, NULL, 8, NULL); /* same expected counts as test_repo_iterator__indexfilelist_3 */
+ git_iterator_free(i);
+
+ git_index_free(index);
+ git_vector_free(&filelist);
+}
+
void test_repo_iterator__indexfilelist_icase(void)
{
git_iterator *i;