-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsamearchive-lite.cpp
More file actions
309 lines (276 loc) · 9.47 KB
/
samearchive-lite.cpp
File metadata and controls
309 lines (276 loc) · 9.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
/* ************************************************************************ *
* samearchive-lite - Reads the paths from the input and output the *
* identical files. This program is written for the *
* special case where each directory acts as an archive *
* or backup. This program is written for the special *
* case where each directory acts as an archive or *
* backup. The output will only contain filename pairs *
* that have the same relative path from the archive *
* base. *
* *
* This version uses a lot less memory then samefile and *
* is faster, but only find a partial set of identical *
* files. It basicaly does 80% of the job, but does this *
* in 50% of the time while using 10% of the resources *
* compared to samearchive. *
* *
* Example: find <dir1> | samearchive-lite <dir1> <dir2> [...] *
* ************************************************************************ *
* Written by Alex de Kruijff 21 April 2009 *
* ************************************************************************ *
* This source was written with a tabstop every four characters *
* In vi type :set ts=4 *
* ************************************************************************ */
static const char version[] = "$Id: samearchive-lite.cpp, v1.00 2009/04/14 00:00:00 akruijff Exp $\n";
#include "configure.h"
#include "toolkit.h"
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#ifdef STATIC_CACHE_CAPACITY
#include "hash.h"
#endif
typedef struct match_t {
struct stat s1;
struct stat s2;
int result;
// make sure match_t don't get out of scope!
match_t &operator=(match_t &t)
{
s1.st_ino = t.s1.st_ino, s1.st_dev = t.s1.st_dev;
s2.st_ino = t.s2.st_ino, s2.st_dev = t.s2.st_dev;
return *this;
}
} match_t;
#ifdef STATIC_CACHE_CAPACITY
static match_t cache[STATIC_CACHE_CAPACITY];
#else // STATIC_CACHE_CAPACITY
static match_t previous;
#endif // STATIC_CACHE_CAPACITY
#define VERBOSE_LEVEL1 1
#define VERBOSE_LEVEL2 2
#define VERBOSE_LEVEL3 3
#define VERBOSE_MAX 3
#define VERBOSE_MASK 3
#define S_VERBOSE(m) ((m) & VERBOSE_MASK)
#define S_VERBOSE_LEVEL1(m) (((m) & VERBOSE_MASK) >= VERBOSE_LEVEL1)
#define S_VERBOSE_LEVEL2(m) (((m) & VERBOSE_MASK) >= VERBOSE_LEVEL2)
#define S_VERBOSE_LEVEL3(m) (((m) & VERBOSE_MASK) >= VERBOSE_LEVEL3)
// Retrieved from processOptions
static char *program = NULL;
static const char *sep = "\t";
static size_t minSize = 0, maxSize = UINT_MAX;
static unsigned int flags = VERBOSE_LEVEL1;
static int eol = '\n';
/**
* Prints the usage of this program.
*/
static void usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "%s read a list of filenames from stdin and\n", program);
fprintf(stderr, "archives (directories) from the paramters and\n");
fprintf(stderr, "writes a list of identical files on stdout.\n");
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [-g size] [-s sep] [-aqVv] <dir1> <dir2> [...]\n", program);
fprintf(stderr, "exampe: find <dir1> | %s [options] <dir1> <dir2> [...]\n", program);
fprintf(stderr, "\n");
fprintf(stderr, " Options: -g : only output files greater than size (0)\n");
fprintf(stderr, " -m : only output files less or equal than size (0)\n");
fprintf(stderr, " -q : suppress non-error messages\n");
fprintf(stderr, " -S : use sep as separator string for files (tab)\n");
fprintf(stderr, " -V : output version information and exit\n");
fprintf(stderr, " -v : more verbose output on stderr\n");
fprintf(stderr, "\n");
exit(EXIT_SUCCESS);
}
static int processOptions(int argc, char **argv)
{
(program = rindex(argv[0], '/')) ? ++program : program = argv[0];
int c;
while((c = getopt(argc, argv, "h?g:S:qVv")) != -1)
switch(c)
{
default: case 'h': case '?': usage(); break;
case '0': eol = 0; break;
case 'g':
if (sscanf(optarg, "%u", &minSize) != 1)
minSize = 0, fprintf(stderr,
"warning: can't convert -g %s, using -g 0 instead",
optarg);
break;
case 'm':
if (sscanf(optarg, "%u", &maxSize) != 1)
minSize = 0, fprintf(stderr,
"warning: can't convert -m %s, using -g 0 instead",
optarg);
break;
case 'S':sep = optarg; break;
case 'q': flags &= ~VERBOSE_MASK; break;
case 'V': printf(version); exit(EXIT_SUCCESS); break;
case 'v': if ((S_VERBOSE(flags)) < VERBOSE_MAX) ++flags; break;
}
return optind;
}
#ifdef STATIC_CACHE_CAPACITY
static hash_t key(const match_t &m)
{
return hashword((hash_t *)&m.s1.st_ino, sizeof(ino_t) / sizeof(hash_t),
hashword((hash_t *)&m.s2.st_ino, sizeof(ino_t) / sizeof(hash_t),
hashword((hash_t *)&m.s1.st_dev, sizeof(dev_t) / sizeof(hash_t),
hashword((hash_t *)&m.s2.st_dev, sizeof(dev_t) / sizeof(hash_t)
))));
}
#endif
/**
* Checks all files on the standerd input with there counter parts in the other
* archives.
*/
static void processInput(int argc, char **argv)
{
if (argc < 2)
usage();
size_t len0 = strlen(argv[0]);
if (argv[0][len0 - 1] == '/')
argv[0][--len0] = 0;
if (argv[1][strlen(argv[1]) - 1] == '/')
argv[1][strlen(argv[1]) - 1] = 0;
// filther out doubles
int *skip = new int[argc];
{
int counter = 0;
for (int i = 1; i < argc; ++i)
{
skip[i] = 0;
for (int j = 0; j < i; ++j)
if (!strcmp(argv[i], argv[j]))
{
skip[i] = 1;
++counter;
continue;
}
}
// Print usage if there are not at least two archives remaining
if (argc < 2)
{
usage();
delete[] skip;
exit(EXIT_SUCCESS);
}
}
// How much extra space do we need in f2 compaired to f1?
size_t diff = strlen(argv[1]) - strlen(argv[0]);
for (int i = 0; i < argc; ++i)
{
if (argv[i][strlen(argv[i]) - 1] == '/')
argv[i][strlen(argv[i])] = 0;
if (diff < strlen(argv[i]) - strlen(argv[0]))
diff = strlen(argv[i]) - strlen(argv[0]);
}
#ifdef STATIC_CACHE_CAPACITY
memset(cache, 0, STATIC_CACHE_CAPACITY * sizeof(match_t));
#else // STATIC_CACHE_CAPACITY
memset(&previous, 0, sizeof(match_t));
#endif // STATIC_CACHE_CAPACITY
int len;
size_t f1n = 256, f2n = 256;
char *f1 = new char[f1n], *f2 = new char[f2n];
// Shortcut: the first part of f2 is constant if argc == 2
if (argc == 2)
memcpy(f2, argv[1], len = strlen(argv[1]));
struct match_t m;
while(fgetline(f1, f1n, stdin, eol) != NULL)
{
// Skip unlinkble lines or do not start with argv[0]
if (lstat(f1, &m.s1) < 0 || (m.s1.st_mode & S_IFREG) == 0 ||
m.s1.st_size <= minSize || m.s1.st_size > maxSize)
continue;
// Skip lines that do not start with argv[0]
if (strlen(f1) < len0 || memcmp(f1, argv[0], len0))
{
fprintf(stderr, "Skipped %s because it didn't start with %s.\n", f1, argv[0]);
fprintf(stderr, "%s %i < %i\n", f1, strlen(f1), len0);
continue;
}
// check on all other archives
char *f = f1 + strlen(argv[0]);
for (int i = 1; i < argc; ++i)
{
// filther out doubles
if (skip[i])
continue;
// enlage f2 if needed
if (f2n < strlen(f1) + diff)
{
while ((f2n <<= 1) < strlen(f1) + diff);
delete[] f2;
f2 = new char[f2n];
if (argc == 2)
memcpy(f2, argv[1], len);
}
// f2 = argv[i] + f
if (argc > 2)
memcpy(f2, argv[i], len = strlen(argv[i]));
strcpy(f2 + len, f);
// Skip unlinkble lines, diffent sizes or same file
if (lstat(f2, &m.s2) || (m.s2.st_mode & S_IFREG == 0) ||
m.s1.st_dev == m.s2.st_dev && m.s1.st_ino == m.s2.st_ino ||
m.s1.st_size != m.s2.st_size)
{
continue;
}
// check f1 with f2
#ifdef STATIC_CACHE_CAPACITY
hash_t k = key(m) % STATIC_CACHE_CAPACITY;
switch(cache[k].s1.st_ino == m.s1.st_ino &&
cache[k].s2.st_ino == m.s2.st_ino &&
cache[k].s1.st_dev == m.s1.st_dev &&
cache[k].s2.st_dev == m.s2.st_dev &&
cache[k].s1.st_ino != 0 ? cache[k].result :
(cache[k] = m).result = fcmp(f1, f2, m.s1, m.s2))
#else // STATIC_CACHE_CAPACITY
switch(previous.s1.st_ino == m.s1.st_ino &&
previous.s2.st_ino == m.s2.st_ino &&
previous.s1.st_dev == m.s1.st_dev &&
previous.s2.st_dev == m.s2.st_dev &&
previous.s1.st_ino != 0 ? previous.result :
(previous = m).result = fcmp(f1, f2, m.s1, m.s2))
#endif // STATIC_CACHE_CAPACITY
{
case FILE_OPEN1_ERROR:
if (S_VERBOSE_LEVEL1(flags))
fprintf(stderr, "inaccessible: %s\n", f1);
break;
case FILE_OPEN2_ERROR:
if (S_VERBOSE_LEVEL1(flags))
fprintf(stderr, "inaccessible: %s\n", f2);
break;
case FILE_READ1_ERROR:
if (S_VERBOSE_LEVEL1(flags))
fprintf(stderr, "unreadable: %s\n", f1);
break;
case FILE_READ2_ERROR:
if (S_VERBOSE_LEVEL1(flags))
fprintf(stderr, "unreadable: %s\n", f2);
break;
case FILE_IDENTICAL:
outputSamefile(f1, f2, m.s1.st_nlink, m.s2.st_nlink,
m.s1.st_size, m.s1.st_dev == m.s2.st_dev, sep);
break;
}
}
}
delete[] skip;
delete[] f1;
delete[] f2;
}
int main(int argc, char **argv)
{
int offset = processOptions(argc, argv);
processInput(argc - offset, argv + offset);
return 0;
}