/* Copyright (C) 2007 Josh MacDonald */
extern "C" {
#include "test.h"
#include <assert.h>
}
#include <list>
#include <vector>
#include <map>
#include <algorithm>
using std::list;
using std::map;
using std::vector;
// MLCG parameters
// a, a*
uint32_t good_32bit_values[] = {
1597334677U, // ...
741103597U, 887987685U,
};
// a, a*
uint64_t good_64bit_values[] = {
1181783497276652981ULL, 4292484099903637661ULL,
7664345821815920749ULL, // ...
};
struct true_type { };
struct false_type { };
template <typename Word>
int bitsof();
template<>
int bitsof<uint32_t>() {
return 32;
}
template<>
int bitsof<uint64_t>() {
return 64;
}
struct plain {
int operator()(const uint8_t &c) {
return c;
}
};
template <typename Word>
struct hhash { // take "h" of the high-bits as a hash value for this
// checksum, which are the most "distant" in terms of the
// spectral test for the rabin_karp MLCG. For short windows,
// the high bits aren't enough, XOR "mask" worth of these in.
Word operator()(const Word& t, const int &h, const int &mask) {
return (t >> h) ^ (t & mask);
}
};
template <typename Word>
Word good_word();
template<>
uint32_t good_word<uint32_t>() {
return good_32bit_values[0];
}
template<>
uint64_t good_word<uint64_t>() {
return good_64bit_values[0];
}
// CLASSES
#define SELF Word, CksumSize, CksumSkip, Permute, Hash, Compaction
#define MEMBER template <typename Word, \
int CksumSize, \
int CksumSkip, \
typename Permute, \
typename Hash, \
int Compaction>
MEMBER
struct cksum_params {
typedef Word word_type;
typedef Permute permute_type;
typedef Hash hash_type;
enum { cksum_size = CksumSize,
cksum_skip = CksumSkip,
compaction = Compaction,
};
};
MEMBER
struct rabin_karp {
typedef Word word_type;
typedef Permute permute_type;
typedef Hash hash_type;
enum { cksum_size = CksumSize,
cksum_skip = CksumSkip,
compaction = Compaction,
};
// (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ...
rabin_karp() {
multiplier = good_word<Word>();
powers = new Word[cksum_size];
powers[cksum_size - 1] = 1;
for (int i = cksum_size - 2; i >= 0; i--) {
powers[i] = powers[i + 1] * multiplier;
}
product = powers[0] * multiplier;
}
~rabin_karp() {
delete [] powers;
}
Word step(const uint8_t *ptr) {
Word h = 0;
for (int i = 0; i < cksum_size; i++) {
h += permute_type()(ptr[i]) * powers[i];
}
return h;
}
Word state0(const uint8_t *ptr) {
incr_state = step(ptr);
return incr_state;
}
Word incr(const uint8_t *ptr) {
incr_state = multiplier * incr_state -
product * permute_type()(ptr[-1]) +
permute_type()(ptr[cksum_size - 1]);
return incr_state;
}
Word *powers;
Word product;
Word multiplier;
Word incr_state;
};
MEMBER
struct adler32_cksum {
typedef Word word_type;
typedef Permute permute_type;
typedef Hash hash_type;
enum { cksum_size = CksumSize,
cksum_skip = CksumSkip,
compaction = Compaction,
};
Word step(const uint8_t *ptr) {
return xd3_lcksum (ptr, cksum_size);
}
Word state0(const uint8_t *ptr) {
incr_state = step(ptr);
return incr_state;
}
Word incr(const uint8_t *ptr) {
incr_state = xd3_large_cksum_update (incr_state, ptr - 1, cksum_size);
return incr_state;
}
Word incr_state;
};
// TESTS
template <typename Word>
struct file_stats {
typedef list<const uint8_t*> ptr_list;
typedef Word word_type;
typedef map<word_type, ptr_list> table_type;
typedef typename table_type::iterator table_iterator;
typedef typename ptr_list::iterator ptr_iterator;
int cksum_size;
int cksum_skip;
int unique;
int unique_values;
int count;
table_type table;
file_stats(int size, int skip)
: cksum_size(size),
cksum_skip(skip),
unique(0),
unique_values(0),
count(0) {
}
void reset() {
unique = 0;
unique_values = 0;
count = 0;
table.clear();
}
void update(const word_type &word, const uint8_t *ptr) {
table_iterator t_i = table.find(word);
count++;
if (t_i == table.end()) {
table.insert(make_pair(word, ptr_list()));
}
ptr_list &pl = table[word];
for (ptr_iterator p_i = pl.begin();
p_i != pl.end();
++p_i) {
if (memcmp(*p_i, ptr, cksum_size) == 0) {
return;
}
}
unique++;
pl.push_back(ptr);
}
void freeze() {
unique_values = table.size();
table.clear();
}
};
struct test_result_base;
static vector<test_result_base*> all_tests;
struct test_result_base {
virtual ~test_result_base() {
}
virtual void reset() = 0;
virtual void print() = 0;
virtual void get(const uint8_t* buf, const int buf_size, int iters) = 0;
virtual void stat() = 0;
virtual int count() = 0;
virtual int dups() = 0;
virtual double uniqueness() = 0;
virtual double fullness() = 0;
virtual double collisions() = 0;
virtual double coverage() = 0;
virtual double compression() = 0;
virtual double time() = 0;
virtual double score() = 0;
virtual void set_score(double min_dups_frac, double min_time) = 0;
virtual double total_time() = 0;
virtual int total_count() = 0;
virtual int total_dups() = 0;
};
struct compare_h {
bool operator()(test_result_base *a,
test_result_base *b) {
return a->score() < b->score();
}
};
MEMBER
struct test_result : public test_result_base {
typedef Word word_type;
typedef Permute permute_type;
typedef Hash hash_type;
enum { cksum_size = CksumSize,
cksum_skip = CksumSkip,
compaction = Compaction,
};
const char *test_name;
file_stats<Word> fstats;
int test_size;
int n_steps;
int n_incrs;
int s_bits;
int s_mask;
int t_entries;
int h_bits;
int h_buckets_full;
double h_score;
char *hash_table;
long accum_millis;
int accum_iters;
// These are not reset
double accum_time;
int accum_count;
int accum_dups;
int accum_colls;
int accum_size;
test_result(const char *name)
: test_name(name),
fstats(cksum_size, cksum_skip),
hash_table(NULL),
accum_millis(0),
accum_iters(0),
accum_time(0.0),
accum_count(0),
accum_dups(0),
accum_colls(0),
accum_size(0) {
all_tests.push_back(this);
}
~test_result() {
reset();
}
void reset() {
// size of file
test_size = -1;
// count
n_steps = -1;
n_incrs = -1;
// four values used by new_table()/summarize_table()
s_bits = -1;
s_mask = -1;
t_entries = -1;
h_bits = -1;
h_buckets_full = -1;
accum_millis = 0;
accum_iters = 0;
fstats.reset();
// temporary
if (hash_table) {
delete(hash_table);
hash_table = NULL;
}
}
int count() {
if (cksum_skip == 1) {
return n_incrs;
} else {
return n_steps;
}
}
int dups() {
return fstats.count - fstats.unique;
}
int colls() {
return fstats.unique - fstats.unique_values;
}
double uniqueness() {
return 1.0 - (double) dups() / count();
}
double fullness() {
return (double) h_buckets_full / (1 << h_bits);
}
double collisions() {
return (double) colls() / fstats.unique;
}
double coverage() {
return (double) h_buckets_full / uniqueness() / count();
}
double compression() {
return 1.0 - coverage();
}
double time() {
return (double) accum_millis / accum_iters;
}
double score() {
return h_score;
}
void set_score(double min_compression, double min_time) {
h_score = (compression() - 0.99 * min_compression)
* (time() - 0.99 * min_time);
}
double total_time() {
return accum_time;
}
int total_count() {
return accum_count;
}
int total_dups() {
return accum_dups;
}
int total_colls() {
return accum_dups;
}
void stat() {
accum_time += time();
accum_count += count();
accum_dups += dups();
accum_colls += colls();
accum_size += test_size;
}
void print() {
if (fstats.count != count()) {
fprintf(stderr, "internal error: %d != %d\n", fstats.count, count());
abort();
}
printf("%s: (%u#%u) count %u uniq %0.2f%% full %u (%0.4f%% coll %0.4f%%) covers %0.2f%% w/ 2^%d @ %.4f MB/s %u iters\n",
test_name,
cksum_size,
cksum_skip,
count(),
100.0 * uniqueness(),
h_buckets_full,
100.0 * fullness(),
100.0 * collisions(),
100.0 * coverage(),
h_bits,
0.001 * accum_iters * test_size / accum_millis,
accum_iters);
}
int size_log2 (int slots)
{
int bits = bitsof<word_type>() - 1;
int i;
for (i = 3; i <= bits; i += 1) {
if (slots <= (1 << i)) {
return i - compaction;
}
}
return bits;
}
void new_table(int entries) {
t_entries = entries;
h_bits = size_log2(entries);
int n = 1 << h_bits;
s_bits = bitsof<word_type>() - h_bits;
s_mask = n - 1;
hash_table = new char[n / 8];
memset(hash_table, 0, n / 8);
}
int get_table_bit(int i) {
return hash_table[i/8] & (1 << i%8);
}
int set_table_bit(int i) {
return hash_table[i/8] |= (1 << i%8);
}
void summarize_table() {
int n = 1 << h_bits;
int f = 0;
for (int i = 0; i < n; i++) {
if (get_table_bit(i)) {
f++;
}
}
h_buckets_full = f;
}
void get(const uint8_t* buf, const int buf_size, int test_iters) {
rabin_karp<SELF> test;
//adler32_cksum<SELF> test;
hash_type hash;
const uint8_t *ptr;
const uint8_t *end;
int last_offset;
int periods;
int stop;
test_size = buf_size;
last_offset = buf_size - cksum_size;
if (last_offset < 0) {
periods = 0;
n_steps = 0;
n_incrs = 0;
stop = -cksum_size;
} else {
periods = last_offset / cksum_skip;
n_steps = periods + 1;
n_incrs = last_offset + 1;
stop = last_offset - (periods + 1) * cksum_skip;
}
// Compute file stats once.
if (fstats.unique_values == 0) {
if (cksum_skip == 1) {
for (int i = 0; i <= buf_size - cksum_size; i++) {
fstats.update(hash(test.step(buf + i), s_bits, s_mask), buf + i);
}
} else {
ptr = buf + last_offset;
end = buf + stop;
for (; ptr != end; ptr -= cksum_skip) {
fstats.update(hash(test.step(ptr), s_bits, s_mask), ptr);
}
}
fstats.freeze();
}
long start_test = get_millisecs_now();
if (cksum_skip != 1) {
new_table(n_steps);
for (int i = 0; i < test_iters; i++) {
ptr = buf + last_offset;
end = buf + stop;
for (; ptr != end; ptr -= cksum_skip) {
set_table_bit(hash(test.step(ptr), s_bits, s_mask));
}
}
summarize_table();
}
stop = buf_size - cksum_size + 1;
if (stop < 0) {
stop = 0;
}
if (cksum_skip == 1) {
new_table(n_incrs);
for (int i = 0; i < test_iters; i++) {
ptr = buf;
end = buf + stop;
if (ptr != end) {
set_table_bit(hash(test.state0(ptr++), s_bits, s_mask));
}
for (; ptr != end; ptr++) {
Word w = test.incr(ptr);
assert(w == test.step(ptr));
set_table_bit(hash(w, s_bits, s_mask));
}
}
summarize_table();
}
accum_iters += test_iters;
accum_millis += get_millisecs_now() - start_test;
}
};
template <typename Word>
void print_array(const char *tname) {
printf("static const %s hash_multiplier[64] = {\n", tname);
Word p = 1;
for (int i = 0; i < 64; i++) {
printf(" %uU,\n", p);
p *= good_word<Word>();
}
printf("};\n", tname);
}
int main(int argc, char** argv) {
int i;
uint8_t *buf = NULL;
size_t buf_len = 0;
int ret;
if (argc <= 1) {
fprintf(stderr, "usage: %s file ...\n", argv[0]);
return 1;
}
//print_array<uint32_t>("uint32_t");
#define TEST(T,Z,S,P,H,C) test_result<T,Z,S,P,H<T>,C> \
_ ## T ## _ ## Z ## _ ## S ## _ ## P ## _ ## H ## _ ## C \
(#T "_" #Z "_" #S "_" #P "_" #H "_" #C)
#if 0
TEST(uint32_t, 4, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 4, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 4, SKIP, plain, hhash, 2); /* x */ \
TEST(uint32_t, 4, SKIP, plain, hhash, 3); /* x */ \
#endif
#define TESTS(SKIP) \
TEST(uint32_t, 9, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 9, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 9, SKIP, plain, hhash, 2); /* x */ \
TEST(uint32_t, 9, SKIP, plain, hhash, 3)
#define TESTS_ALL(SKIP) \
TEST(uint32_t, 3, SKIP, plain, hhash, 0); \
TEST(uint32_t, 3, SKIP, plain, hhash, 1); \
TEST(uint32_t, 4, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 4, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 4, SKIP, plain, hhash, 2); /* x */ \
TEST(uint32_t, 4, SKIP, plain, hhash, 3); /* x */ \
TEST(uint32_t, 5, SKIP, plain, hhash, 0); \
TEST(uint32_t, 5, SKIP, plain, hhash, 1); \
TEST(uint32_t, 8, SKIP, plain, hhash, 0); \
TEST(uint32_t, 8, SKIP, plain, hhash, 1); \
TEST(uint32_t, 9, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 9, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 9, SKIP, plain, hhash, 2); /* x */ \
TEST(uint32_t, 9, SKIP, plain, hhash, 3); /* x */ \
TEST(uint32_t, 11, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 11, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 13, SKIP, plain, hhash, 0); \
TEST(uint32_t, 13, SKIP, plain, hhash, 1); \
TEST(uint32_t, 15, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 15, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 16, SKIP, plain, hhash, 0); /* x */ \
TEST(uint32_t, 16, SKIP, plain, hhash, 1); /* x */ \
TEST(uint32_t, 21, SKIP, plain, hhash, 0); \
TEST(uint32_t, 21, SKIP, plain, hhash, 1); \
TEST(uint32_t, 34, SKIP, plain, hhash, 0); \
TEST(uint32_t, 34, SKIP, plain, hhash, 1); \
TEST(uint32_t, 55, SKIP, plain, hhash, 0); \
TEST(uint32_t, 55, SKIP, plain, hhash, 1)
TESTS(1); // *
// TESTS(2); // *
// TESTS(3); // *
// TESTS(5); // *
// TESTS(8); // *
// TESTS(9);
// TESTS(11);
// TESTS(13); // *
TESTS(15);
// TESTS(16);
// TESTS(21); // *
// TESTS(34); // *
// TESTS(55); // *
// TESTS(89); // *
for (i = 1; i < argc; i++) {
if ((ret = read_whole_file(argv[i],
& buf,
& buf_len))) {
return 1;
}
fprintf(stderr, "file %s is %zu bytes\n",
argv[i], buf_len);
double min_time = -1.0;
double min_compression = 0.0;
for (vector<test_result_base*>::iterator i = all_tests.begin();
i != all_tests.end(); ++i) {
test_result_base *test = *i;
test->reset();
int iters = 100;
long start_test = get_millisecs_now();
do {
test->get(buf, buf_len, iters);
iters *= 3;
iters /= 2;
} while (get_millisecs_now() - start_test < 2000);
test->stat();
if (min_time < 0.0) {
min_compression = test->compression();
min_time = test->time();
}
if (min_time > test->time()) {
min_time = test->time();
}
if (min_compression > test->compression()) {
min_compression = test->compression();
}
test->print();
}
// for (vector<test_result_base*>::iterator i = all_tests.begin();
// i != all_tests.end(); ++i) {
// test_result_base *test = *i;
// test->set_score(min_compression, min_time);
// }
// sort(all_tests.begin(), all_tests.end(), compare_h());
// for (vector<test_result_base*>::iterator i = all_tests.begin();
// i != all_tests.end(); ++i) {
// test_result_base *test = *i;
// test->print();
// }
free(buf);
buf = NULL;
}
return 0;
}