szaydel
1/18/2019 - 3:16 PM

Scan filesystem structure for names containing non-ASCII characters

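The goal of this program is to identify paths of files or directories whose names contain non-ASCII characters (or ASCII control characters and the reserved characters "*+,:;<=>?[\]|), because such files or directories will not work correctly over AFP, SMB, etc. Unless the -n (dry-run) option is given, each offending entry is also renamed with the offending characters removed.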

$ ./ns ../test-special-names/
2019-01-18 07:07:59        64 B   ../test-special-names/defথabc2/
2019-01-18 07:07:59        64 B   ../test-special-names/defথabc4/
2019-01-18 07:07:59        64 B   ../test-special-names/defথabc3/
2019-01-18 07:07:59         0 B   ../test-special-names/abc۞def1.txt
2019-01-18 07:07:59         0 B   ../test-special-names/abc۞def0.txt
2019-01-18 07:07:59         0 B   ../test-special-names/abc۞def2.txt
2019-01-18 07:07:59         0 B   ../test-special-names/abc۞def3.txt
2019-01-18 07:07:59        64 B   ../test-special-names/defথabc1/
/* We want POSIX.1-2008 + XSI, i.e. SuSv4, features */
#define _XOPEN_SOURCE 700

/* Added on 2017-06-25:
   If the C library can support 64-bit file sizes
   and offsets, using the standard names,
   these defines tell the C library to do so. */
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64

#include <errno.h>
#include <ftw.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/stat.h>

/* POSIX.1 says each process has at least 20 file descriptors.
 * Three of those belong to the standard streams.
 * Here, we use a conservative estimate of 15 available;
 * assuming we use at most two for other uses in this program,
 * we should never run into any problems.
 * Most trees are shallower than that, so it is efficient.
 * Deeper trees are traversed fine, just a bit slower.
 * (Linux allows typically hundreds to thousands of open files,
 *  so you'll probably never see any issues even if you used
 *  a much higher value, say a couple of hundred, but
 *  15 is a safe, reasonable value.)
*/
#ifndef USE_FDS
#define USE_FDS 15
#endif

// This is a hack: the fallback value is good enough for testing and for the
// systems this is known to run on, but it is not really portable.
#ifndef MAXPATHLEN
#define MAXPATHLEN 4096
#endif

#define NOK 0
#define OK 1

// Table of forbidden ASCII characters, indexed by character code: a slot set
// to 1 marks a character that must not appear in a name; every other slot is
// left at 0.
// According to spec, the characters we do not allow are: "\/[]:+|<>=;?,*
// '/' is the one exception that is NOT marked here. It is invalid inside a
// single component, but the strings being checked are paths, and paths need
// '/' to separate their components.
static const unsigned lookup[128] = {
    ['"']  = 1,
    ['*']  = 1,
    ['+']  = 1,
    [',']  = 1,
    [':']  = 1,
    [';']  = 1,
    ['<']  = 1,
    ['=']  = 1,
    ['>']  = 1,
    ['?']  = 1,
    ['[']  = 1,
    ['\\'] = 1,
    [']']  = 1,
    ['|']  = 1,
};

/* Dry-run switch: main() sets it to true when "-n" is on the command line.
 * While it is true, process_entry() only prints the rename it would perform
 * instead of calling rename(). */
bool dry_run = false;

/* True when no bit above the low seven is set, i.e. the value is 0..127. */
#define isascii(c) (!(((int)(c)) & ~0177))

/*
 * Classify one byte of a path.
 *
 * Returns OK when the byte lies in the range 32..127 and is not marked in the
 * lookup table; returns NOK for anything below 32, anything above 127, and
 * every character the table marks as forbidden.
 *
 * NOTE(review): 127 (DEL) falls inside the accepted range and is not in the
 * table, so it is reported as OK -- confirm that this is intended.
 */
int allowed(unsigned char c) {
    /* The order of the tests matters: lookup[] has 128 slots and is only
     * indexed once the earlier tests have established that c <= 127. */
    const int rejected = c < 32 || c > 127 || !isascii(c) || lookup[c] == 1;

    return rejected ? NOK : OK;
}

/*
 * Print the size column for one entry, scaled to the largest binary unit
 * (TiB, GiB, MiB, KiB) the value reaches, or as plain bytes below 1 KiB.
 */
static void print_entry_size(const double bytes)
{
    if (bytes >= 1099511627776.0) {
        printf(" %9.3f TiB", bytes / 1099511627776.0);
    } else if (bytes >= 1073741824.0) {
        printf(" %9.3f GiB", bytes / 1073741824.0);
    } else if (bytes >= 1048576.0) {
        printf(" %9.3f MiB", bytes / 1048576.0);
    } else if (bytes >= 1024.0) {
        printf(" %9.3f KiB", bytes / 1024.0);
    } else {
        printf(" %9.0f B  ", bytes);
    }
}

/*
 * Read the target of the symbolic link at `path`.
 *
 * On success stores a malloc()ed, NUL-terminated copy of the target in
 * *target_out (the caller frees it) and returns 0. On failure returns ENOMEM
 * or the errno left by readlink() and leaves *target_out untouched.
 */
static int read_link_target(const char *path, char **target_out)
{
    size_t capacity = 1024;

    for (;;) {
        char *buffer = malloc(capacity);
        if (buffer == NULL)
            return ENOMEM;

        /* Keep one byte in reserve for the terminator readlink() omits. */
        const ssize_t len = readlink(path, buffer, capacity - 1);
        if (len == (ssize_t)-1) {
            const int saved_errno = errno;
            free(buffer);
            return saved_errno;
        }

        if ((size_t)len < capacity - 1) {
            buffer[len] = '\0';
            *target_out = buffer;
            return 0;
        }

        /* The result filled the buffer and may be truncated: grow, retry. */
        free(buffer);
        capacity += 1024;
    }
}

/*
 * nftw() callback: report one entry whose own name contains a byte that
 * allowed() rejects and, unless dry_run is set, rename the entry with those
 * bytes removed.
 *
 * Only the final path component (from pathinfo->base onward) is examined and
 * rewritten. The leading directories are copied unchanged, because stripping
 * bytes from them would make the rename target point into a directory that
 * does not exist, and would report entries whose own name is fine.
 *
 * The rename is refused, and the reason printed, when the cleaned name would
 * be empty, when the new path does not fit in MAXPATHLEN bytes, or when
 * something already exists at the new path (rename() would silently replace
 * it).
 *
 * filepath  path of the entry as passed in by nftw()
 * info      stat data for the entry; not read when typeflag is FTW_NS
 * typeflag  FTW_* classification of the entry
 * pathinfo  supplies the offset of the final component within filepath
 *
 * Returns 0 to let the walk continue. Returns ENOMEM or a readlink() errno
 * value, which stops the walk, when a symlink target cannot be read.
 */
int process_entry(const char *filepath, const struct stat *info,
                const int typeflag, struct FTW *pathinfo)
{
    char new_filepath[MAXPATHLEN];
    const size_t prefix_len = (size_t)pathinfo->base;
    const char *const name = filepath + prefix_len;

    /* Count the bytes of the name that survive and those that do not. */
    size_t kept = 0;
    size_t dropped = 0;
    for (const char *p = name; *p != '\0'; p++) {
        if (allowed((unsigned char)*p))
            kept++;
        else
            dropped++;
    }
    if (dropped == 0)
        return 0;

    /* Build the new path, or work out why no rename can be attempted. */
    const char *problem = NULL;
    if (kept == 0) {
        problem = "no valid characters left in name";
    } else if (prefix_len + kept >= sizeof new_filepath) {
        problem = "path too long";
    } else {
        struct stat existing;
        size_t out = prefix_len;

        memcpy(new_filepath, filepath, prefix_len);
        for (const char *p = name; *p != '\0'; p++) {
            if (allowed((unsigned char)*p))
                new_filepath[out++] = *p;
        }
        new_filepath[out] = '\0';

        /* Best-effort guard against clobbering another entry; there is an
         * unavoidable window between this check and the rename() below. */
        if (lstat(new_filepath, &existing) == 0)
            problem = "target already exists";
    }

    /* Resolve the symlink target before printing anything, so that a failure
     * does not leave half a line on the output. */
    char *target = NULL;
    if (typeflag == FTW_SL) {
        const int link_error = read_link_target(filepath, &target);
        if (link_error != 0)
            return link_error;
    }

    /* With FTW_NS the stat buffer is undefined, so fall back to zeros. */
    const bool have_stat = (typeflag != FTW_NS);
    const double bytes = have_stat ? (double)info->st_size : 0.0; /* Not exact if large! */
    const time_t modified = have_stat ? info->st_mtime : (time_t)0;
    struct tm mtime = {0};

    localtime_r(&modified, &mtime);

    printf("%04d-%02d-%02d %02d:%02d:%02d",
           mtime.tm_year+1900, mtime.tm_mon+1, mtime.tm_mday,
           mtime.tm_hour, mtime.tm_min, mtime.tm_sec);

    print_entry_size(bytes);

    switch (typeflag) {
    case FTW_SL:
        printf(" %s -> %s", filepath, target);
        break;
    case FTW_SLN:
        printf(" %s (dangling symlink)", filepath);
        break;
    case FTW_F:
        printf(" %s", filepath);
        break;
    case FTW_D:
    case FTW_DP:
        printf(" %s/", filepath);
        break;
    case FTW_DNR:
        printf(" %s/ (unreadable)", filepath);
        break;
    default:
        printf(" %s (unknown)", filepath);
        break;
    }
    free(target);

    if (typeflag == FTW_DNR) {
        printf(" (skipping stale dir)\n");
    } else if (problem != NULL) {
        printf(" (Not Fixed: %s)\n", problem);
    } else if (dry_run) {
        printf(" [rename] -> %s\n", new_filepath);
    } else if (rename(filepath, new_filepath) == 0) {
        printf(" (Fixed)\n");
    } else {
        printf(" (Not Fixed: %s)\n", strerror(errno));
    }
    return 0;
}


/*
 * Walk the tree rooted at dirpath, handing every entry to process_entry().
 *
 * Returns 0 on success. Otherwise returns a non-zero errno value, which is
 * also left in errno: EINVAL for a NULL or empty path, the value a callback
 * returned to stop the walk, or the errno set by a failing nftw().
 */
int print_directory_tree(const char *const dirpath)
{
    /* Invalid directory path? */
    if (dirpath == NULL || *dirpath == '\0')
        return errno = EINVAL;

    /* FTW_PHYS:  report symbolic links themselves instead of following them.
     * FTW_DEPTH: visit a directory's contents before the directory itself.
     *            The callback may rename what it visits; in a pre-order walk
     *            a renamed directory's children would then be visited under
     *            a parent path that no longer exists. */
    const int result = nftw(dirpath, process_entry, USE_FDS,
                            FTW_PHYS | FTW_DEPTH);

    /* A non-negative result is the callback's return value (0 when the walk
     * completed); on -1 nftw() has already set errno. */
    if (result >= 0)
        errno = result;

    return errno;
}

/*
 * Walk one tree and, if that fails, print the reason on stderr.
 * Returns 0 on success, the non-zero error value otherwise.
 */
static int scan_tree_or_report(const char *path)
{
    const int err = print_directory_tree(path);

    if (err != 0)
        fprintf(stderr, "%s.\n", strerror(err));
    return err;
}

/*
 * Usage: ns [-n] [DIRECTORY...]
 *
 * Every argument other than "-n" is scanned as a directory tree, in order;
 * with no directory arguments the current directory is scanned. "-n" may
 * appear anywhere on the command line and switches on dry-run mode for all
 * trees. A directory literally named "-n" must be given as "./-n".
 *
 * Exits with EXIT_FAILURE as soon as one tree cannot be walked.
 */
int main(int argc, char *argv[])
{
    int trees_requested = 0;

    /* Options first, so that "-n" also covers directories listed before it. */
    for (int arg = 1; arg < argc; arg++) {
        if (strcmp(argv[arg], "-n") == 0)
            dry_run = true;
    }

    for (int arg = 1; arg < argc; arg++) {
        if (strcmp(argv[arg], "-n") == 0)
            continue;

        trees_requested++;
        if (scan_tree_or_report(argv[arg]) != 0)
            return EXIT_FAILURE;
    }

    /* No directory named (possibly only "-n" was given): default to ".". */
    if (trees_requested == 0 && scan_tree_or_report(".") != 0)
        return EXIT_FAILURE;

    return EXIT_SUCCESS;
}