| /* |
| * Copyright (c) Facebook, Inc. |
| * All rights reserved. |
| * |
| * This source code is licensed under both the BSD-style license (found in the |
| * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| * in the COPYING file in the root directory of this source tree). |
| * You may select, at your option, one of the above-listed licenses. |
| */ |
| |
| #include "data.h" |
| |
| #include <assert.h> |
| #include <errno.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <stdlib.h> /* free() */ |
| |
| #include <sys/stat.h> |
| |
| #include <curl/curl.h> |
| |
| #include "mem.h" |
| #include "util.h" |
| #define XXH_STATIC_LINKING_ONLY |
| #include "xxhash.h" |
| |
| /** |
| * Data objects |
| */ |
| |
/** Base URL of the GitHub release that hosts the regression data archives. */
#define REGRESSION_RELEASE_BASE \
    "https://github.com/facebook/zstd/releases/download/regression-data/"

/**
 * Builds the download URL of a release asset.
 * `x` must be a string literal: it is pasted after the base URL by
 * compile-time string literal concatenation.
 */
#define REGRESSION_RELEASE(x) REGRESSION_RELEASE_BASE x
| |
| data_t silesia = { |
| .name = "silesia", |
| .type = data_type_dir, |
| .data = |
| { |
| .url = REGRESSION_RELEASE("silesia.tar.zst"), |
| .xxhash64 = 0x48a199f92f93e977LL, |
| }, |
| }; |
| |
| data_t silesia_tar = { |
| .name = "silesia.tar", |
| .type = data_type_file, |
| .data = |
| { |
| .url = REGRESSION_RELEASE("silesia.tar.zst"), |
| .xxhash64 = 0x48a199f92f93e977LL, |
| }, |
| }; |
| |
| data_t github = { |
| .name = "github", |
| .type = data_type_dir, |
| .data = |
| { |
| .url = REGRESSION_RELEASE("github.tar.zst"), |
| .xxhash64 = 0xa9b1b44b020df292LL, |
| }, |
| .dict = |
| { |
| .url = REGRESSION_RELEASE("github.dict.zst"), |
| .xxhash64 = 0x1eddc6f737d3cb53LL, |
| |
| }, |
| }; |
| |
| data_t github_tar = { |
| .name = "github.tar", |
| .type = data_type_file, |
| .data = |
| { |
| .url = REGRESSION_RELEASE("github.tar.zst"), |
| .xxhash64 = 0xa9b1b44b020df292LL, |
| }, |
| .dict = |
| { |
| .url = REGRESSION_RELEASE("github.dict.zst"), |
| .xxhash64 = 0x1eddc6f737d3cb53LL, |
| |
| }, |
| }; |
| |
| static data_t* g_data[] = { |
| &silesia, |
| &silesia_tar, |
| &github, |
| &github_tar, |
| NULL, |
| }; |
| |
| data_t const* const* data = (data_t const* const*)g_data; |
| |
| /** |
| * data helpers. |
| */ |
| |
| int data_has_dict(data_t const* data) { |
| return data->dict.url != NULL; |
| } |
| |
| /** |
| * data buffer helper functions (documented in header). |
| */ |
| |
| data_buffer_t data_buffer_create(size_t const capacity) { |
| data_buffer_t buffer = {}; |
| |
| buffer.data = (uint8_t*)malloc(capacity); |
| if (buffer.data == NULL) |
| return buffer; |
| buffer.capacity = capacity; |
| return buffer; |
| } |
| |
| data_buffer_t data_buffer_read(char const* filename) { |
| data_buffer_t buffer = {}; |
| |
| uint64_t const size = UTIL_getFileSize(filename); |
| if (size == UTIL_FILESIZE_UNKNOWN) { |
| fprintf(stderr, "unknown size for %s\n", filename); |
| return buffer; |
| } |
| |
| buffer.data = (uint8_t*)malloc(size); |
| if (buffer.data == NULL) { |
| fprintf(stderr, "malloc failed\n"); |
| return buffer; |
| } |
| buffer.capacity = size; |
| |
| FILE* file = fopen(filename, "rb"); |
| if (file == NULL) { |
| fprintf(stderr, "file null\n"); |
| goto err; |
| } |
| buffer.size = fread(buffer.data, 1, buffer.capacity, file); |
| fclose(file); |
| if (buffer.size != buffer.capacity) { |
| fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity); |
| goto err; |
| } |
| |
| return buffer; |
| err: |
| free(buffer.data); |
| memset(&buffer, 0, sizeof(buffer)); |
| return buffer; |
| } |
| |
| data_buffer_t data_buffer_get_data(data_t const* data) { |
| data_buffer_t const kEmptyBuffer = {}; |
| |
| if (data->type != data_type_file) |
| return kEmptyBuffer; |
| |
| return data_buffer_read(data->data.path); |
| } |
| |
| data_buffer_t data_buffer_get_dict(data_t const* data) { |
| data_buffer_t const kEmptyBuffer = {}; |
| |
| if (!data_has_dict(data)) |
| return kEmptyBuffer; |
| |
| return data_buffer_read(data->dict.path); |
| } |
| |
| int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) { |
| size_t const size = |
| buffer1.size < buffer2.size ? buffer1.size : buffer2.size; |
| int const cmp = memcmp(buffer1.data, buffer2.data, size); |
| if (cmp != 0) |
| return cmp; |
| if (buffer1.size < buffer2.size) |
| return -1; |
| if (buffer1.size == buffer2.size) |
| return 0; |
| assert(buffer1.size > buffer2.size); |
| return 1; |
| } |
| |
| void data_buffer_free(data_buffer_t buffer) { |
| free(buffer.data); |
| } |
| |
| /** |
| * data filenames helpers. |
| */ |
| |
| FileNamesTable* data_filenames_get(data_t const* data) |
| { |
| char const* const path = data->data.path; |
| return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ ); |
| } |
| |
| /** |
| * data buffers helpers. |
| */ |
| |
| data_buffers_t data_buffers_get(data_t const* data) { |
| data_buffers_t buffers = {.size = 0}; |
| FileNamesTable* const filenames = data_filenames_get(data); |
| if (filenames == NULL) return buffers; |
| if (filenames->tableSize == 0) { |
| UTIL_freeFileNamesTable(filenames); |
| return buffers; |
| } |
| |
| data_buffer_t* buffersPtr = |
| (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr)); |
| if (buffersPtr == NULL) { |
| UTIL_freeFileNamesTable(filenames); |
| return buffers; |
| } |
| buffers.buffers = (data_buffer_t const*)buffersPtr; |
| buffers.size = filenames->tableSize; |
| |
| for (size_t i = 0; i < filenames->tableSize; ++i) { |
| buffersPtr[i] = data_buffer_read(filenames->fileNames[i]); |
| if (buffersPtr[i].data == NULL) { |
| data_buffers_t const kEmptyBuffer = {}; |
| data_buffers_free(buffers); |
| UTIL_freeFileNamesTable(filenames); |
| return kEmptyBuffer; |
| } |
| } |
| |
| UTIL_freeFileNamesTable(filenames); |
| return buffers; |
| } |
| |
| /** |
| * Frees the data buffers. |
| */ |
| void data_buffers_free(data_buffers_t buffers) { |
| free((data_buffer_t*)buffers.buffers); |
| } |
| |
| /** |
| * Initialization and download functions. |
| */ |
| |
/**
 * Heap copy of the cache directory passed to data_init().
 * NULL until data_init() sets it and again after data_finish() frees it.
 */
static char* g_data_dir = NULL;
| |
/**
 * Creates `indir` and any missing parent directories, like `mkdir -p`.
 * New directories are created with mode S_IRWXU.
 *
 * @param indir Directory path to create. Must not be NULL.
 * @returns 0 on success, otherwise an errno value: EINVAL for an empty path,
 *          ENOMEM if the path could not be copied, or the errno reported by
 *          the failing mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    /* The scan below starts at the second character, which would step past
     * the terminator of an empty string. */
    if (indir[0] == '\0')
        return EINVAL;

    /* Work on a private copy because the path is cut at each level. */
    char* const dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    int ret = 0;
    char* end = dir;
    do {
        /* Find the next directory level. The first character is skipped, so
         * a leading '/' is never treated as a separator. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        int const mkdirRet = mkdir(dir, S_IRWXU);
        /* Capture errno immediately so nothing later can overwrite it. */
        int const mkdirErrno = errno;
        *end = save;
        /* It's okay if the directory already exists. */
        if (mkdirRet != 0 && !(mkdirErrno == EEXIST && isdir)) {
            ret = mkdirErrno;
            fprintf(stderr, "mkdir() failed\n");
            break;
        }
    } while (*end != '\0');

    free(dir);
    return ret;
}
| |
/**
 * Concatenate 3 strings into a new buffer.
 *
 * @param str1 First piece (must not be NULL).
 * @param str2 Second piece (must not be NULL).
 * @param str3 Third piece, or NULL to concatenate only the first two.
 * @returns A malloc()ed NUL-terminated string that the caller must free(),
 *          or NULL if the allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;

    char* const out = (char*)malloc(total + 1);
    if (out == NULL)
        return NULL;

    /* Append each piece at a running cursor; every strcpy() writes its own
     * terminator, which the next piece overwrites. */
    char* cursor = out;
    strcpy(cursor, str1);
    cursor += len1;
    strcpy(cursor, str2);
    cursor += len2;
    if (str3 != NULL)
        strcpy(cursor, str3);

    assert(strlen(out) == total);
    return out;
}
| |
/**
 * Concatenate 2 strings into a new buffer.
 *
 * @returns The result of cat3() with no third piece: a malloc()ed string the
 *          caller must free(), or NULL if the allocation fails.
 */
static char* cat2(char const* str1, char const* str2) {
    char const* const noThirdPiece = NULL;
    return cat3(str1, str2, noThirdPiece);
}
| |
| /** |
| * State needed by the curl callback. |
| * It takes data from curl, hashes it, and writes it to the file. |
| */ |
| typedef struct { |
| FILE* file; |
| XXH64_state_t xxhash64; |
| int error; |
| } curl_data_t; |
| |
| /** Create the curl state. */ |
| static curl_data_t curl_data_create( |
| data_resource_t const* resource, |
| data_type_t type) { |
| curl_data_t cdata = {}; |
| |
| XXH64_reset(&cdata.xxhash64, 0); |
| |
| assert(UTIL_isDirectory(g_data_dir)); |
| |
| if (type == data_type_file) { |
| /* Decompress the resource and store to the path. */ |
| char* cmd = cat3("zstd -dqfo '", resource->path, "'"); |
| if (cmd == NULL) { |
| cdata.error = ENOMEM; |
| return cdata; |
| } |
| cdata.file = popen(cmd, "w"); |
| free(cmd); |
| } else { |
| /* Decompress and extract the resource to the cache directory. */ |
| char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'"); |
| if (cmd == NULL) { |
| cdata.error = ENOMEM; |
| return cdata; |
| } |
| cdata.file = popen(cmd, "w"); |
| free(cmd); |
| } |
| if (cdata.file == NULL) { |
| cdata.error = errno; |
| } |
| |
| return cdata; |
| } |
| |
| /** Free the curl state. */ |
| static int curl_data_free(curl_data_t cdata) { |
| return pclose(cdata.file); |
| } |
| |
| /** curl callback. Updates the hash, and writes to the file. */ |
| static size_t curl_write(void* data, size_t size, size_t count, void* ptr) { |
| curl_data_t* cdata = (curl_data_t*)ptr; |
| size_t const written = fwrite(data, size, count, cdata->file); |
| XXH64_update(&cdata->xxhash64, data, written * size); |
| return written; |
| } |
| |
| static int curl_download_resource( |
| CURL* curl, |
| data_resource_t const* resource, |
| data_type_t type) { |
| curl_data_t cdata; |
| /* Download the data. */ |
| if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0) |
| return EINVAL; |
| if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0) |
| return EINVAL; |
| cdata = curl_data_create(resource, type); |
| if (cdata.error != 0) |
| return cdata.error; |
| int const curl_err = curl_easy_perform(curl); |
| int const close_err = curl_data_free(cdata); |
| if (curl_err) { |
| fprintf( |
| stderr, |
| "downloading '%s' for '%s' failed\n", |
| resource->url, |
| resource->path); |
| return EIO; |
| } |
| if (close_err) { |
| fprintf(stderr, "writing data to '%s' failed\n", resource->path); |
| return EIO; |
| } |
| /* check that the file exists. */ |
| if (type == data_type_file && !UTIL_isRegularFile(resource->path)) { |
| fprintf(stderr, "output file '%s' does not exist\n", resource->path); |
| return EIO; |
| } |
| if (type == data_type_dir && !UTIL_isDirectory(resource->path)) { |
| fprintf( |
| stderr, "output directory '%s' does not exist\n", resource->path); |
| return EIO; |
| } |
| /* Check that the hash matches. */ |
| if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) { |
| fprintf( |
| stderr, |
| "checksum does not match: 0x%llxLL != 0x%llxLL\n", |
| (unsigned long long)XXH64_digest(&cdata.xxhash64), |
| (unsigned long long)resource->xxhash64); |
| return EINVAL; |
| } |
| |
| return 0; |
| } |
| |
| /** Download a single data object. */ |
| static int curl_download_datum(CURL* curl, data_t const* data) { |
| int ret; |
| ret = curl_download_resource(curl, &data->data, data->type); |
| if (ret != 0) |
| return ret; |
| if (data_has_dict(data)) { |
| ret = curl_download_resource(curl, &data->dict, data_type_file); |
| if (ret != 0) |
| return ret; |
| } |
| return ret; |
| } |
| |
| /** Download all the data. */ |
| static int curl_download_data(data_t const* const* data) { |
| if (curl_global_init(CURL_GLOBAL_ALL) != 0) |
| return EFAULT; |
| |
| curl_data_t cdata = {}; |
| CURL* curl = curl_easy_init(); |
| int err = EFAULT; |
| |
| if (curl == NULL) |
| return EFAULT; |
| |
| if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0) |
| goto out; |
| if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0) |
| goto out; |
| if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0) |
| goto out; |
| |
| assert(data != NULL); |
| for (; *data != NULL; ++data) { |
| if (curl_download_datum(curl, *data) != 0) |
| goto out; |
| } |
| |
| err = 0; |
| out: |
| curl_easy_cleanup(curl); |
| curl_global_cleanup(); |
| return err; |
| } |
| |
| /** Fill the path member variable of the data objects. */ |
| static int data_create_paths(data_t* const* data, char const* dir) { |
| size_t const dirlen = strlen(dir); |
| assert(data != NULL); |
| for (; *data != NULL; ++data) { |
| data_t* const datum = *data; |
| datum->data.path = cat3(dir, "/", datum->name); |
| if (datum->data.path == NULL) |
| return ENOMEM; |
| if (data_has_dict(datum)) { |
| datum->dict.path = cat2(datum->data.path, ".dict"); |
| if (datum->dict.path == NULL) |
| return ENOMEM; |
| } |
| } |
| return 0; |
| } |
| |
| /** Free the path member variable of the data objects. */ |
| static void data_free_paths(data_t* const* data) { |
| assert(data != NULL); |
| for (; *data != NULL; ++data) { |
| data_t* datum = *data; |
| free((void*)datum->data.path); |
| free((void*)datum->dict.path); |
| datum->data.path = NULL; |
| datum->dict.path = NULL; |
| } |
| } |
| |
/** Name of the stamp file stored inside the cache directory. */
static const char kStampName[] = "STAMP";
| |
| static void xxh_update_le(XXH64_state_t* state, uint64_t data) { |
| if (!MEM_isLittleEndian()) |
| data = MEM_swap64(data); |
| XXH64_update(state, &data, sizeof(data)); |
| } |
| |
| /** Hash the data to create the stamp. */ |
| static uint64_t stamp_hash(data_t const* const* data) { |
| XXH64_state_t state; |
| |
| XXH64_reset(&state, 0); |
| assert(data != NULL); |
| for (; *data != NULL; ++data) { |
| data_t const* datum = *data; |
| /* We don't care about the URL that we fetch from. */ |
| /* The path is derived from the name. */ |
| XXH64_update(&state, datum->name, strlen(datum->name)); |
| xxh_update_le(&state, datum->data.xxhash64); |
| xxh_update_le(&state, datum->dict.xxhash64); |
| xxh_update_le(&state, datum->type); |
| } |
| return XXH64_digest(&state); |
| } |
| |
| /** Check if the stamp matches the stamp in the cache directory. */ |
| static int stamp_check(char const* dir, data_t const* const* data) { |
| char* stamp = cat3(dir, "/", kStampName); |
| uint64_t const expected = stamp_hash(data); |
| XXH64_canonical_t actual; |
| FILE* stampfile = NULL; |
| int matches = 0; |
| |
| if (stamp == NULL) |
| goto out; |
| if (!UTIL_isRegularFile(stamp)) { |
| fprintf(stderr, "stamp does not exist: recreating the data cache\n"); |
| goto out; |
| } |
| |
| stampfile = fopen(stamp, "rb"); |
| if (stampfile == NULL) { |
| fprintf(stderr, "could not open stamp: recreating the data cache\n"); |
| goto out; |
| } |
| |
| size_t b; |
| if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) { |
| fprintf(stderr, "invalid stamp: recreating the data cache\n"); |
| goto out; |
| } |
| |
| matches = (expected == XXH64_hashFromCanonical(&actual)); |
| if (matches) |
| fprintf(stderr, "stamp matches: reusing the cached data\n"); |
| else |
| fprintf(stderr, "stamp does not match: recreating the data cache\n"); |
| |
| out: |
| free(stamp); |
| if (stampfile != NULL) |
| fclose(stampfile); |
| return matches; |
| } |
| |
| /** On success write a new stamp, on failure delete the old stamp. */ |
| static int |
| stamp_write(char const* dir, data_t const* const* data, int const data_err) { |
| char* stamp = cat3(dir, "/", kStampName); |
| FILE* stampfile = NULL; |
| int err = EIO; |
| |
| if (stamp == NULL) |
| return ENOMEM; |
| |
| if (data_err != 0) { |
| err = data_err; |
| goto out; |
| } |
| XXH64_canonical_t hash; |
| |
| XXH64_canonicalFromHash(&hash, stamp_hash(data)); |
| |
| stampfile = fopen(stamp, "wb"); |
| if (stampfile == NULL) |
| goto out; |
| if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1) |
| goto out; |
| err = 0; |
| fprintf(stderr, "stamped new data cache\n"); |
| out: |
| if (err != 0) |
| /* Ignore errors. */ |
| unlink(stamp); |
| free(stamp); |
| if (stampfile != NULL) |
| fclose(stampfile); |
| return err; |
| } |
| |
| int data_init(char const* dir) { |
| int err; |
| |
| if (dir == NULL) |
| return EINVAL; |
| |
| /* This must be first to simplify logic. */ |
| err = ensure_directory_exists(dir); |
| if (err != 0) |
| return err; |
| |
| /* Save the cache directory. */ |
| g_data_dir = strdup(dir); |
| if (g_data_dir == NULL) |
| return ENOMEM; |
| |
| err = data_create_paths(g_data, dir); |
| if (err != 0) |
| return err; |
| |
| /* If the stamp matches then we are good to go. |
| * This must be called before any modifications to the data cache. |
| * After this point, we MUST call stamp_write() to update the STAMP, |
| * since we've updated the data cache. |
| */ |
| if (stamp_check(dir, data)) |
| return 0; |
| |
| err = curl_download_data(data); |
| if (err != 0) |
| goto out; |
| |
| out: |
| /* This must be last, since it must know if data_init() succeeded. */ |
| stamp_write(dir, data, err); |
| return err; |
| } |
| |
| void data_finish(void) { |
| data_free_paths(g_data); |
| free(g_data_dir); |
| g_data_dir = NULL; |
| } |