2#include <apr_strings.h>
/*
 * Key-extraction callbacks. Only the declarations appear here; no definition
 * is visible in this listing. ft_config_create() passes them to
 * napr_hash_make() as the get_key callbacks of the conf->sizes and
 * conf->gids tables respectively.
 */
17const void *ft_fsize_get_key(
const void *opaque);
18const void *ft_gids_get_key(
const void *opaque);
/*
 * Comparison callback over two ft_file_t records, ordered by their `size`
 * field. Both arguments arrive as untyped pointers and are cast back to
 * const ft_file_t * before being compared.
 *
 * NOTE(review): the value returned from each branch is not visible here;
 * confirm the sign convention against the container that uses this callback.
 */
20int ft_file_cmp(
const void *param1,
const void *param2)
22 const ft_file_t *file1 = (
const ft_file_t *) param1;
23 const ft_file_t *file2 = (
const ft_file_t *) param2;
/* First record is strictly smaller than the second. */
25 if (file1->size < file2->size) {
/* First record is strictly larger than the second. */
28 if (file2->size < file1->size) {
/*
 * Built-in ignore patterns. ft_load_defaults() walks this array until it
 * meets a NULL entry, so the list has to stay NULL-terminated.
 * Entries ending in a slash name directories; entries with a star are
 * wildcard file patterns (presumably interpreted by the ft_ignore pattern
 * code; confirm there).
 */
36static const char *
const default_ignores[] = {
38 ".git/",
".hg/",
".svn/",
40 "build/",
"dist/",
"out/",
"target/",
"bin/",
41 "*.o",
"*.class",
"*.pyc",
"*.pyo",
43 "node_modules/",
"vendor/",
".venv/",
45 ".DS_Store",
"Thumbs.db",
"*.swp",
"*~",
".idea/",
".vscode/",
/*
 * Pool cleanup callback: releases a compiled pattern with pcre_free().
 * ft_pcre_compile() registers it through apr_pool_cleanup_register(), with
 * the compiled pattern as the callback argument.
 *
 * pcre_space: the pointer that was handed to apr_pool_cleanup_register().
 */
49static apr_status_t ft_pcre_free_cleanup(
void *pcre_space)
51 pcre_free(pcre_space);
/*
 * Compile `regex` with PCRE and tie the compiled pattern to `pool`.
 *
 * regex:    pattern text.
 * caseless: requests PCRE_CASELESS (presumably when non-zero; the guarding
 *           condition is not visible here).
 * pool:     pool on which ft_pcre_free_cleanup is registered for the result.
 *
 * Returns the value produced by pcre_compile().
 */
55static pcre *ft_pcre_compile(
const char *regex,
int caseless, apr_pool_t *pool)
57 const char *errptr = NULL;
/* Base flags applied to every pattern compiled by this helper. */
59 int options = PCRE_DOLLAR_ENDONLY | PCRE_DOTALL;
63 options |= PCRE_CASELESS;
66 result = pcre_compile(regex, options, &errptr, &erroffset, NULL);
/*
 * The message prints the whole pattern, then the first `erroffset`
 * characters of it (via %.*s), then PCRE's own error text.
 * NOTE(review): the text always names -e / --regex-ignore-file, but this
 * helper is also called for the whitelist and archive patterns.
 */
68 DEBUG_ERR(
"can't parse %s at [%.*s] for -e / --regex-ignore-file: %s", regex, erroffset, regex, errptr);
/* Free the compiled pattern when the pool is cleaned up. */
71 apr_pool_cleanup_register(pool, result, ft_pcre_free_cleanup, apr_pool_cleanup_null);
/* Presumably the capacity of the local `list` buffer used below; its declaration is not visible here. */
78static const int MAX_GIDS = 256;
/*
 * Record the calling process's group IDs in the `gids` hash table.
 *
 * username: not referenced by any line visible here.
 * gids:     table searched with a gid_t key for each collected group ID.
 * pool:     pool from which new ft_gid_t entries are allocated.
 */
80static apr_status_t fill_gids_ht(
const char *username, napr_hash_t *gids, apr_pool_t *pool)
83 apr_uint32_t hash_value = 0;
86 memset(list, 0,
sizeof(list));
/* Ask for at most as many supplementary group IDs as `list` can hold. */
87 nb_gid = getgroups((
int) (
sizeof(list) /
sizeof(gid_t)), list);
/*
 * If a slot is still free, store the effective group ID after the
 * entries returned by getgroups().
 * NOTE(review): the right-hand side is an unsigned size_t expression; if
 * nb_gid is a signed int, a negative value converts to a huge unsigned
 * one here. Confirm the error path (not visible) rules that out.
 */
98 if (nb_gid < (
sizeof(list) /
sizeof(gid_t))) {
99 list[nb_gid] = getegid();
103 for (
int idx = 0; idx < nb_gid; idx++) {
/* Look the group ID up; hash_value receives the hash computed by the search. */
104 ft_gid_t *gid =
napr_hash_search(gids, &(list[idx]),
sizeof(gid_t), &hash_value);
/* Presumably reached only when the search found nothing (condition not visible). */
106 gid = apr_palloc(pool,
sizeof(
struct ft_gid_t));
107 gid->val = list[idx];
/*
 * Split a comma-separated list of file names and process each name in turn.
 *
 * hash:      destination table (the insertion call itself is not visible here).
 * file_list: comma-separated names.
 */
115static void ft_hash_add_ignore_list(napr_hash_t *hash,
const char *file_list)
117 const char *filename = NULL;
118 const char *end = NULL;
119 apr_uint32_t hash_value = 0;
120 apr_pool_t *pool = NULL;
124 filename = file_list;
/* Find the end of the current name. */
126 end = strchr(filename,
',');
/* A comma was found: copy only the characters before it. */
128 tmp = apr_pstrndup(pool, filename, end - filename);
/* Otherwise copy the rest of the string. */
131 tmp = apr_pstrdup(pool, filename);
139 }
/* Keep going while the last search found a comma and text remains. */
while ((NULL != end) && (
'\0' != *filename));
/*
 * Walk the built-in default_ignores table, stopping at its NULL entry.
 * What is done with each pattern is not visible here.
 *
 * conf: configuration being initialised.
 */
142static void ft_load_defaults(
ft_conf_t *conf)
144 for (
int idx = 0; default_ignores[idx] != NULL; idx++) {
/*
 * Print the package string, the copyright line, the Apache License 2.0
 * notice and the bug-report address to stdout.
 * The (void) casts discard the fprintf() return values on purpose.
 */
149static void version(
void)
151 (void) fprintf(stdout, PACKAGE_STRING
"\n");
152 (void) fprintf(stdout,
"Copyright (C) 2007 François Pesce\n");
153 (void) fprintf(stdout,
"Licensed under the Apache License, Version 2.0 (the \"License\");\n");
154 (void) fprintf(stdout,
"you may not use this file except in compliance with the License.\n");
155 (void) fprintf(stdout,
"You may obtain a copy of the License at\n");
156 (void) fprintf(stdout,
"\n");
157 (void) fprintf(stdout,
"\thttp://www.apache.org/licenses/LICENSE-2.0\n");
158 (void) fprintf(stdout,
"\n");
159 (void) fprintf(stdout,
"Unless required by applicable law or agreed to in writing, software\n");
160 (void) fprintf(stdout,
"distributed under the License is distributed on an \"AS IS\" BASIS,\n");
161 (void) fprintf(stdout,
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n");
162 (void) fprintf(stdout,
"See the License for the specific language governing permissions and\n");
163 (void) fprintf(stdout,
"limitations under the License.\n\n");
/* PACKAGE_BUGREPORT is pasted into the literal at compile time. */
164 (void) fprintf(stdout,
"Report bugs to " PACKAGE_BUGREPORT
"\n");
/*
 * Print the help text to stdout: package string, synopsis, a short
 * description, then one line per entry of `opt_option`.
 *
 * name:       program name shown in the synopsis.
 * opt_option: option table; iteration stops at the first entry whose
 *             `name` is NULL.
 */
167static void usage(
const char *name,
const apr_getopt_option_t *opt_option)
169 (void) fprintf(stdout, PACKAGE_STRING
"\n");
170 (void) fprintf(stdout,
"Usage: %s [OPTION]... [FILES or DIRECTORIES]...\n", name);
171 (void) fprintf(stdout,
"Find identical files passed as parameter or recursively found in directories.\n");
172 (void) fprintf(stdout,
"\n");
173 (void) fprintf(stdout,
"Mandatory arguments to long options are mandatory for short options too.\n");
174 (void) fprintf(stdout,
"\n");
176 for (
int idx = 0; NULL != opt_option[idx].name; idx++) {
/* Short option, long option and description, separated by tabs. */
177 (void) fprintf(stdout,
"-%c,\t--%s\t%s\n", opt_option[idx].optch, opt_option[idx].name, opt_option[idx].description);
/*
 * Report a command-line error on stderr, then print the usage text.
 *
 * name:       program name, forwarded to usage().
 * opt_option: option table, forwarded to usage().
 * error_msg:  message printed after "Error: ".
 *
 * NOTE(review): the declaration of `arg` (printed after error_msg) and the
 * call that ends the process are not visible here.
 */
181static void print_usage_and_exit(
const char *name,
const apr_getopt_option_t *opt_option,
const char *error_msg,
185 (void) fprintf(stderr,
"Error: %s %s\n\n", error_msg, arg);
187 usage(name, opt_option);
/*
 * Sizing arguments used by ft_config_create(): the *_BUCKET_SIZE values are
 * passed as `nel` and the *_MAX_ENTRIES values as `ffactor` to
 * napr_hash_str_make() / napr_hash_make().
 */
191static const int HASH_STR_BUCKET_SIZE = 32;
192static const int HASH_STR_MAX_ENTRIES = 8;
193static const int HASH_SIZE_BUCKET_SIZE = 4096;
194static const int HASH_SIZE_MAX_ENTRIES = 8;
/* Initial value of conf->excess_size: 50 * 1024 * 1024 bytes. */
195static const apr_off_t EXCESS_SIZE_DEFAULT = (50LL * 1024 * 1024);
/*
 * Build a configuration object with its default values.
 *
 * pool: pool used for the hash tables created here.
 *
 * Returns the new ft_conf_t (its allocation is not visible here).
 */
197ft_conf_t *ft_config_create(apr_pool_t *pool)
199 apr_uint32_t hash_value = 0;
/* String-keyed table of file names to ignore. */
204 conf->ig_files =
napr_hash_str_make(pool, HASH_STR_BUCKET_SIZE, HASH_STR_MAX_ENTRIES);
/* Table keyed by apr_off_t (see the apr_off_t_key_* callbacks). */
205 conf->sizes =
napr_hash_make(pool, HASH_SIZE_BUCKET_SIZE, HASH_SIZE_MAX_ENTRIES, ft_fsize_get_key,
206 ft_fsize_get_key_len, apr_off_t_key_cmp, apr_off_t_key_hash);
/* Table keyed by gid_t (see the gid_t_key_* callbacks). */
207 conf->gids =
napr_hash_make(pool, HASH_SIZE_BUCKET_SIZE, HASH_SIZE_MAX_ENTRIES, ft_gids_get_key, ft_gid_get_key_len,
208 gid_t_key_cmp, gid_t_key_hash);
/* No pattern is compiled until ft_config_parse_args() sees the matching option. */
216 conf->ig_regex = NULL;
217 conf->wl_regex = NULL;
218 conf->ar_regex = NULL;
220 conf->p_path_len = 0;
224 conf->excess_size = (apr_off_t) EXCESS_SIZE_DEFAULT;
226 conf->respect_gitignore = 1;
228 ft_load_defaults(conf);
/* The mask is assigned, not OR-ed: OPTION_RECSD is the only flag set by default. */
229 conf->mask = OPTION_RECSD;
230 conf->threshold = PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD;
/* One of the similarity thresholds handle_image_options() can assign to conf->threshold. */
235static const double DEFAULT_THRESHOLD = 0.5;
/*
 * Command-line option table handed to apr_getopt_long() and printed by
 * usage(). Each entry is { long name, short option character, whether an
 * argument is expected, help text }. The leading tabs in some help texts
 * only align the output of usage().
 * usage() stops at an entry whose name is NULL, so the table needs such a
 * terminator (not visible here).
 */
237static const apr_getopt_option_t opt_option[] = {
238 {
"hidden",
'a', FALSE,
"do not ignore hidden files."},
239 {
"case-unsensitive",
'c', FALSE,
"this option applies to regex match."},
240 {
"display-size",
'd', FALSE,
"\tdisplay size before duplicates (human-readable)."},
241 {
"dry-run",
'n', FALSE,
"\tonly print the operations that would be done."},
242 {
"regex-ignore-file",
'e', TRUE,
"filenames that match this are ignored."},
243 {
"follow-symlink",
'f', FALSE,
"follow symbolic links."},
244 {
"help",
'h', FALSE,
"\t\tdisplay usage."},
245 {
"image-cmp",
'I', FALSE,
"\twill run ftwin in image cmp mode (using libpuzzle)."},
246 {
"image-threshold",
'T', TRUE,
247 "will change the image similarity threshold\n\t\t\t\t (default is [1], accepted [2/3/4/5])."},
248 {
"ignore-list",
'i', TRUE,
"\tcomma-separated list of file names to ignore."},
250 {
"json",
'J', FALSE,
"\t\toutput results in machine-readable JSON format."},
252 {
"minimal-length",
'm', TRUE,
"minimum size of file to process."},
253 {
"max-size",
'M', TRUE,
"maximum size of file to process."},
254 {
"optimize-memory",
'o', FALSE,
"reduce memory usage, but increase process time."},
255 {
"priority-path",
'p', TRUE,
"\tfile in this path are displayed first when\n\t\t\t\tduplicates are reported."},
256 {
"recurse-subdir",
'r', FALSE,
"recurse subdirectories (default: on)."},
257 {
"no-recurse",
'R', FALSE,
"do not recurse in subdirectories."},
258 {
"separator",
's', TRUE,
"\tseparator character between twins, default: \\n."},
259 {
"tar-cmp",
't', FALSE,
"\twill process files archived in .tar default: off."},
260 {
"threads",
'j', TRUE,
"\tnumber of threads for parallel hashing (default: CPU cores)."},
261 {
"verbose",
'v', FALSE,
"\tdisplay a progress bar."},
262 {
"version",
'V', FALSE,
"\tdisplay version."},
263 {
"whitelist-regex-file",
'w', TRUE,
"filenames that doesn't match this are ignored."},
264 {
"excessive-size",
'x', TRUE,
"excessive size of file that switch off mmap use."},
/*
 * Forward declarations of the per-category option handlers, so that
 * process_options() can call them before their definitions further down.
 */
293static void handle_flag_option(
int option,
ft_conf_t *conf);
294static void handle_string_option(
int option,
const char *optarg,
ft_conf_t *conf,
struct regex_options *opts);
295static void handle_numeric_option(
int option,
const char *optarg,
ft_conf_t *conf,
const char *name,
296 const apr_getopt_option_t *opt_option);
297static void handle_special_option(
int option,
const char *optarg,
ft_conf_t *conf,
struct regex_options *opts,
298 const char *name,
const apr_getopt_option_t *opt_option);
/*
 * Route one parsed option to the handler for its category (flag, string,
 * numeric or special). The selection logic between the calls is not
 * visible here.
 *
 * option: short option character returned by apr_getopt_long().
 * optarg: argument of the option, if any.
 * conf:   configuration being filled in.
 * opts:   where regex strings are stored until they are compiled.
 * name:   program name, used in error and usage output.
 *
 * `opt_option` below is the file-scope option table; this function has no
 * parameter of that name.
 */
300static void process_options(
int option,
const char *optarg,
ft_conf_t *conf,
struct regex_options *opts,
const char *name)
313 handle_flag_option(option, conf);
322 handle_string_option(option, optarg, conf, opts);
330 handle_numeric_option(option, optarg, conf, name, opt_option);
340 handle_special_option(option, optarg, conf, opts, name, opt_option);
/*
 * Entries of the flag_mappings table read by handle_flag_option():
 * { option character, flag in conf->mask, value to give that flag }.
 * 'r' and 'R' both act on OPTION_RECSD, turning it on and off.
 */
350 {
'a', OPTION_SHOW_HIDDEN, 1},
351 {
'c', OPTION_ICASE, 1},
352 {
'd', OPTION_SIZED, 1},
353 {
'n', OPTION_DRY_RUN, 1},
354 {
'f', OPTION_FSYML, 1},
355 {
'o', OPTION_OPMEM, 1},
356 {
'r', OPTION_RECSD, 1},
357 {
'R', OPTION_RECSD, 0}
/*
 * Apply an option that only switches a flag in conf->mask.
 *
 * option: short option character.
 * conf:   configuration whose mask is updated.
 */
360static void handle_flag_option(
int option,
ft_conf_t *conf)
/* Linear scan of the mapping table for the option character. */
362 for (
size_t idx = 0; idx <
sizeof(flag_mappings) /
sizeof(flag_mappings[0]); ++idx) {
363 if (flag_mappings[idx].option_char == option) {
364 set_option(&conf->mask, flag_mappings[idx].option_flag, flag_mappings[idx].value);
/*
 * OPTION_VERBO is switched on only while OPTION_JSON is not set
 * (presumably the handling of -v; the enclosing condition is not visible).
 */
371 if (!is_option_set(conf->mask, OPTION_JSON)) {
372 set_option(&conf->mask, OPTION_VERBO, 1);
/*
 * Apply an option whose argument is kept as text.
 *
 * option: short option character (the selecting conditions are not
 *         visible here).
 * optarg: option argument.
 * conf:   configuration; its pool receives the duplicated strings.
 * opts:   holders for regex strings compiled later by
 *         ft_config_parse_args().
 */
377static void handle_string_option(
int option,
const char *optarg,
ft_conf_t *conf,
struct regex_options *opts)
/* Keep a pool-owned copy of the ignore regex text. */
381 *(opts->
ignore_regex) = apr_pstrdup(conf->pool, optarg);
/* Comma-separated names are handed to the ignore-file table. */
384 ft_hash_add_ignore_list(conf->ig_files, optarg);
/* Priority path: store the copy and cache its length. */
387 conf->p_path = apr_pstrdup(conf->pool, optarg);
388 conf->p_path_len = strlen(conf->p_path);
/*
 * Apply an option whose argument is a number: thread count, minimal and
 * maximal file size, and the excessive-size limit (-x).
 *
 * option:     short option character (the selecting conditions are not
 *             visible here).
 * optarg:     option argument to parse.
 * conf:       configuration receiving the parsed value.
 * name:       program name, forwarded to print_usage_and_exit().
 * opt_option: option table, forwarded to print_usage_and_exit().
 */
402static void handle_numeric_option(
int option,
const char *optarg,
ft_conf_t *conf,
const char *name,
403 const apr_getopt_option_t *opt_option)
/*
 * Thread count: the whole argument must parse as a base-10 integer in
 * [1, MAX_THREADS].
 * NOTE(review): the message hard-codes 256; confirm it matches MAX_THREADS.
 */
408 long threads = strtol(optarg, &endptr, BASE_TEN);
409 if (*endptr !=
'\0' || threads < 1 || threads > MAX_THREADS) {
410 print_usage_and_exit(name, opt_option,
"Invalid number of threads (must be 1-256):", optarg);
412 conf->num_threads = (
unsigned int) threads;
/* A negative conf->minsize is reported as an invalid --minimal-length. */
417 if (conf->minsize < 0) {
418 print_usage_and_exit(name, opt_option,
"Invalid size for --minimal-length:", optarg);
/* A negative conf->maxsize is reported as an invalid --max-size. */
423 if (conf->maxsize < 0) {
424 print_usage_and_exit(name, opt_option,
"Invalid size for --max-size:", optarg);
428 conf->excess_size = (apr_off_t) strtoul(optarg, NULL, BASE_TEN);
/*
 * strtoul() yields ULONG_MAX when the value is out of range, so the check
 * has to look at excess_size, the field just assigned, not at minsize.
 */
429 if (ULONG_MAX == conf->excess_size) {
430 print_usage_and_exit(name, opt_option,
"can't parse for -x / --excessive-size", optarg);
/*
 * Apply the image-comparison options.
 *
 * option:     short option character (the selecting conditions are not
 *             visible here).
 * optarg:     option argument; echoed in the "invalid threshold" error.
 * conf:       configuration whose mask and threshold are updated.
 * wregex:     receives the whitelist regex text for image mode.
 * name:       program name, forwarded to print_usage_and_exit().
 * opt_option: option table, forwarded to print_usage_and_exit().
 */
445static void handle_image_options(
int option,
const char *optarg,
ft_conf_t *conf,
char **wregex,
const char *name,
446 const apr_getopt_option_t *opt_option)
/*
 * Image mode: turn on case-insensitive matching and the OPTION_PUZZL flag,
 * and restrict processing to names ending in gif, png, jpg or jpeg.
 */
450 set_option(&conf->mask, OPTION_ICASE, 1);
451 set_option(&conf->mask, OPTION_PUZZL, 1);
452 *wregex = apr_pstrdup(conf->pool,
".*\\.(gif|png|jpe?g)$");
/*
 * Five selectable thresholds follow.
 * NOTE(review): which argument value selects which one is not visible here.
 */
457 conf->threshold = PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD;
460 conf->threshold = PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD;
463 conf->threshold = DEFAULT_THRESHOLD;
466 conf->threshold = PUZZLE_CVEC_SIMILARITY_THRESHOLD;
469 conf->threshold = PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD;
472 print_usage_and_exit(name, opt_option,
"invalid threshold:", optarg);
/*
 * Apply the options that need more than a flag or a stored value: usage
 * output, image options, JSON output and archive processing. The selecting
 * conditions are not visible here.
 *
 * option:     short option character.
 * optarg:     option argument, forwarded to handle_image_options().
 * conf:       configuration being updated.
 * opts:       holders for the whitelist and archive regex strings.
 * name:       program name, used in usage output.
 * opt_option: option table, used in usage output.
 */
481static void handle_special_option(
int option,
const char *optarg,
ft_conf_t *conf,
struct regex_options *opts,
482 const char *name,
const apr_getopt_option_t *opt_option)
486 usage(name, opt_option);
493 handle_image_options(option, optarg, conf, opts->
whitelist_regex, name, opt_option);
/*
 * JSON output and verbose mode are mutually exclusive: if verbose was
 * already set, warn on stderr and clear it. handle_flag_option() covers
 * the opposite order by not setting OPTION_VERBO once OPTION_JSON is set.
 */
497 set_option(&conf->mask, OPTION_JSON, 1);
498 if (is_option_set(conf->mask, OPTION_VERBO)) {
499 (void) fprintf(stderr,
"Warning: Verbose mode disabled for JSON output.\n");
500 set_option(&conf->mask, OPTION_VERBO, 0);
/* Archive mode: set OPTION_UNTAR and store the archive-name regex text. */
505 set_option(&conf->mask, OPTION_UNTAR, 1);
506 *(opts->
archive_regex) = apr_pstrdup(conf->pool,
".*\\.(tar\\.gz|tgz|tar\\.bz2|tbz2|tar\\.xz|txz|zip|rar|7z|tar)$");
/*
 * Parse the command line into `conf`.
 *
 * conf:            configuration to fill; conf->pool is used for all
 *                  allocations made here.
 * argc, argv:      the program's arguments.
 * first_arg_index: if not NULL, receives opt_state->ind, the getopt index
 *                  reached after option parsing.
 *
 * The visible steps are, in order: initialise getopt, dispatch every option
 * to process_options(), look up the current user and group, fill the
 * group-ID table, then compile the regex strings collected while parsing.
 * What is returned after each logged failure is not visible here.
 */
514apr_status_t ft_config_parse_args(
ft_conf_t *conf,
int argc,
const char **argv,
int *first_arg_index)
516 char errbuf[ERROR_BUFFER_SIZE];
517 char *regex_str = NULL;
518 char *wregex_str = NULL;
519 char *arregex_str = NULL;
/*
 * The handlers write regex text through these pointers.
 * NOTE(review): the initializer is positional, so it relies on the member
 * order of struct regex_options (declared elsewhere); confirm.
 */
520 struct regex_options opts = { &regex_str, &wregex_str, &arregex_str };
521 apr_getopt_t *opt_state = NULL;
522 const char *optarg = NULL;
524 apr_status_t status = APR_SUCCESS;
526 memset(errbuf, 0,
sizeof(errbuf));
527 status = apr_getopt_init(&opt_state, conf->pool, argc, argv);
528 if (APR_SUCCESS != status) {
529 DEBUG_ERR(
"error calling apr_getopt_init: %s", apr_strerror(status, errbuf, ERROR_BUFFER_SIZE));
/* Handle options one at a time until apr_getopt_long() stops returning APR_SUCCESS. */
533 while (APR_SUCCESS == (status = apr_getopt_long(opt_state, opt_option, &option, &optarg))) {
534 process_options(option, optarg, conf, &opts, argv[0]);
537 status = apr_uid_current(&(conf->userid), &(conf->groupid), conf->pool);
538 if (APR_SUCCESS != status) {
539 DEBUG_ERR(
"error calling apr_uid_current: %s", apr_strerror(status, errbuf, ERROR_BUFFER_SIZE));
543 status = apr_uid_name_get(&(conf->username), conf->userid, conf->pool);
544 if (APR_SUCCESS != status) {
545 DEBUG_ERR(
"error calling apr_uid_name_get: %s", apr_strerror(status, errbuf, ERROR_BUFFER_SIZE));
549 status = fill_gids_ht(conf->username, conf->gids, conf->pool);
550 if (APR_SUCCESS != status) {
551 DEBUG_ERR(
"error calling fill_gids_ht: %s", apr_strerror(status, errbuf, ERROR_BUFFER_SIZE));
/*
 * The patterns are compiled only after the option loop has ended, so the
 * final state of OPTION_ICASE applies to all three of them.
 */
555 if (NULL != regex_str) {
556 conf->ig_regex = ft_pcre_compile(regex_str, is_option_set(conf->mask, OPTION_ICASE), conf->pool);
557 if (NULL == conf->ig_regex) {
562 if (NULL != wregex_str) {
563 conf->wl_regex = ft_pcre_compile(wregex_str, is_option_set(conf->mask, OPTION_ICASE), conf->pool);
564 if (NULL == conf->wl_regex) {
569 if (NULL != arregex_str) {
570 conf->ar_regex = ft_pcre_compile(arregex_str, is_option_set(conf->mask, OPTION_ICASE), conf->pool);
571 if (NULL == conf->ar_regex) {
/* The out-parameter is optional. */
577 if (first_arg_index != NULL) {
578 *first_arg_index = opt_state->ind;
UTIL debug output macros.
#define DEBUG_ERR(str, arg...)
Display an error message at the error level.
apr_status_t ft_ignore_add_pattern_str(ft_ignore_context_t *ctx, const char *pattern_str)
Adds a single pattern string to a context.
ft_ignore_context_t * ft_ignore_context_create(apr_pool_t *pool, ft_ignore_context_t *parent, const char *base_dir)
Creates a new ignore context.
unsigned int ft_get_cpu_cores(void)
Get the number of available CPU cores on the current system.
System-related utility functions.
apr_off_t parse_human_size(const char *size_str)
Parses a human-readable size string (e.g., "10M", "2.5G") into bytes.
Utilities for parsing and formatting human-readable file sizes.
apr_pool_t * napr_hash_pool_get(const napr_hash_t *thehash)
Get a pointer to the pool from which the hash table was allocated.
napr_hash_t * napr_hash_str_make(apr_pool_t *pool, apr_size_t nel, apr_size_t ffactor)
Create a hash table optimized for storing C strings as keys.
apr_status_t napr_hash_set(napr_hash_t *hash, void *data, apr_uint32_t hash_value)
Inserts or updates an item in the hash table.
void * napr_hash_search(napr_hash_t *hash, const void *key, apr_size_t key_len, apr_uint32_t *hash_value)
Searches the hash table for an item.
napr_hash_t * napr_hash_make(apr_pool_t *pool, apr_size_t nel, apr_size_t ffactor, get_key_callback_fn_t get_key, get_key_len_callback_fn_t get_key_len, key_cmp_callback_fn_t key_cmp, hash_callback_fn_t hash)
Create a hash table with custom key handling and hashing functions.
napr_heap_t * napr_heap_make(apr_pool_t *pool, napr_heap_cmp_callback_fn_t *cmp)
Creates a new heap.
Maps a command-line option character to its corresponding flag and value.
int value
The value to set (1 for on, 0 for off).
int option_char
The single-character option, e.g., 'a'.
int option_flag
The flag to set, e.g., OPTION_SHOW_HIDDEN.
Main configuration structure for the ftwin application.
A structure to hold pointers to the various regex string options.
char ** ignore_regex
Pointer to the ignore regex string.
char ** archive_regex
Pointer to the archive regex string.
char ** whitelist_regex
Pointer to the whitelist regex string.