/* Copyright 2016 Google Inc. All Rights Reserved. Author: zip753@gmail.com (Ivan Nikulin) Distributed under MIT license. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT */ /* Tool for generating optimal backward references for the input file. Uses sais-lite library for building suffix array. */ #include <algorithm> #include <cassert> #include <cstdio> #include <cstring> #include <functional> #include <utility> #include <vector> #include <gflags/gflags.h> using gflags::ParseCommandLineFlags; #include "./esaxx/sais.hxx" DEFINE_bool(advanced, false, "Advanced searching mode: finds all longest " "matches at positions that are not covered by matches of length at least " "max_length. WARNING: uses much more memory than simple mode, especially " "for small values of min_length."); DEFINE_int32(min_length, 1, "Minimal length of found backward references."); /* For advanced mode. */ DEFINE_int32(long_length, 32, "Maximal length of found backward references for advanced mode."); DEFINE_int32(skip, 1, "Number of bytes to skip."); const size_t kFileBufferSize = (1 << 16); // 64KB typedef int sarray_type; // Can't make it unsigned because of templates :( typedef uint8_t input_type; typedef uint32_t lcp_type; typedef std::pair<int, std::vector<int> > entry_type; typedef std::function<void(sarray_type*, lcp_type*, size_t, int, int, int, int, int)> Fn; void ReadInput(FILE* fin, input_type* storage, size_t input_size) { size_t last_pos = 0; size_t available_in; fseek(fin, 0, SEEK_SET); do { available_in = fread(storage + last_pos, 1, kFileBufferSize, fin); last_pos += available_in; } while (available_in != 0); assert(last_pos == input_size); } void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp, size_t size, uint32_t* pos) { for (int i = 0; i < size; ++i) { pos[sarray[i]] = i; } uint32_t k = 0; lcp[size - 1] = 0; for (int i = 0; i < size; ++i) { if (pos[i] == size - 1) { k = 0; continue; } uint32_t j = sarray[pos[i] + 1]; // Suffix which follow i-th suffix in SA. while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) { ++k; } lcp[pos[i]] = k; if (k > 0) --k; } } inline void PrintReference(sarray_type* sarray, lcp_type* lcp, size_t size, int idx, int left_ix, int right_ix, int left_lcp, int right_lcp, FILE* fout) { int max_lcp_ix; if (right_ix == size - 1 || (left_ix >= 0 && left_lcp >= right_lcp)) { max_lcp_ix = left_ix; } else { max_lcp_ix = right_ix; } int dist = idx - sarray[max_lcp_ix]; assert(dist > 0); fputc(1, fout); fwrite(&idx, sizeof(int), 1, fout); // Position in input. fwrite(&dist, sizeof(int), 1, fout); // Backward distance. } inline void GoLeft(sarray_type* sarray, lcp_type* lcp, int idx, int left_ix, int left_lcp, entry_type* entry) { entry->first = left_lcp; if (left_lcp > FLAGS_long_length) return; for (; left_ix >= 0; --left_ix) { if (lcp[left_ix] < left_lcp) break; if (sarray[left_ix] < idx) { entry->second.push_back(idx - sarray[left_ix]); } } } inline void GoRight(sarray_type* sarray, lcp_type* lcp, int idx, size_t size, int right_ix, int right_lcp, entry_type* entry) { entry->first = right_lcp; if (right_lcp > FLAGS_long_length) return; for (; right_ix < size - 1; ++right_ix) { if (lcp[right_ix] < right_lcp) break; if (sarray[right_ix] < idx) { entry->second.push_back(idx - sarray[right_ix]); } } } inline void StoreReference(sarray_type* sarray, lcp_type* lcp, size_t size, int idx, int left_ix, int right_ix, int left_lcp, int right_lcp, entry_type* entries) { if (right_ix == size - 1 || (left_ix >= 0 && left_lcp > right_lcp)) { // right is invalid or left is better GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]); } else if (left_ix < 0 || (right_ix < size - 1 && right_lcp > left_lcp)) { // left is invalid or right is better GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]); } else { // both are valid and of equal length GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]); GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]); } } void ProcessReferences(sarray_type* sarray, lcp_type* lcp, size_t size, uint32_t* pos, const Fn& process_output) { int min_length = FLAGS_min_length; for (int idx = FLAGS_skip; idx < size; ++idx) { int left_lcp = -1; int left_ix; for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) { if (left_lcp == -1 || left_lcp > lcp[left_ix]) { left_lcp = lcp[left_ix]; } if (left_lcp == 0) break; if (sarray[left_ix] < idx) break; } int right_lcp = -1; int right_ix; for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) { if (right_lcp == -1 || right_lcp > lcp[right_ix]) { right_lcp = lcp[right_ix]; } // Stop if we have better result from the left side already. if (right_lcp < left_lcp && left_ix >= 0) break; if (right_lcp == 0) break; if (sarray[right_ix] < idx) break; } if ((left_ix >= 0 && left_lcp >= min_length) || (right_ix < size - 1 && right_lcp >= min_length)) { process_output(sarray, lcp, size, idx, left_ix, right_ix, left_lcp, right_lcp); } } } void ProcessEntries(entry_type* entries, size_t size, FILE* fout) { int long_length = FLAGS_long_length; std::vector<std::pair<int, int> > segments; size_t idx; for (idx = 0; idx < size;) { entry_type& entry = entries[idx]; if (entry.first > long_length) { // Add segment. if (segments.empty() || segments.back().second < idx) { segments.push_back({idx, idx + entry.first}); } else { segments.back().second = idx + entry.first; } } ++idx; } printf("Segments generated.\n"); size_t segments_ix = 0; for (idx = 0; idx < size;) { if (idx == segments[segments_ix].first) { // Skip segment. idx = segments[segments_ix].second; } else { for (auto& dist : entries[idx].second) { fputc(1, fout); fwrite(&idx, sizeof(int), 1, fout); // Position in input. fwrite(&dist, sizeof(int), 1, fout); // Backward distance. } ++idx; } } } int main(int argc, char* argv[]) { ParseCommandLineFlags(&argc, &argv, true); if (argc != 3) { printf("usage: %s input_file output_file\n", argv[0]); return 1; } FILE* fin = fopen(argv[1], "rb"); FILE* fout = fopen(argv[2], "w"); fseek(fin, 0, SEEK_END); int input_size = ftell(fin); fseek(fin, 0, SEEK_SET); printf("The file size is %u bytes\n", input_size); input_type* storage = new input_type[input_size]; ReadInput(fin, storage, input_size); fclose(fin); sarray_type* sarray = new sarray_type[input_size]; saisxx(storage, sarray, input_size); printf("Suffix array calculated.\n"); // Inverse suffix array. uint32_t* pos = new uint32_t[input_size]; lcp_type* lcp = new lcp_type[input_size]; BuildLCP(storage, sarray, lcp, input_size, pos); printf("LCP array constructed.\n"); delete[] storage; using std::placeholders::_1; using std::placeholders::_2; using std::placeholders::_3; using std::placeholders::_4; using std::placeholders::_5; using std::placeholders::_6; using std::placeholders::_7; using std::placeholders::_8; entry_type* entries; if (FLAGS_advanced) { entries = new entry_type[input_size]; for (size_t i = 0; i < input_size; ++i) entries[i].first = -1; } Fn print = std::bind(PrintReference, _1, _2, _3, _4, _5, _6, _7, _8, fout); Fn store = std::bind(StoreReference, _1, _2, _3, _4, _5, _6, _7, _8, entries); ProcessReferences(sarray, lcp, input_size, pos, FLAGS_advanced ? store : print); printf("References processed.\n"); if (FLAGS_advanced) { int good_cnt = 0; uint64_t avg_cnt = 0; for (size_t i = 0; i < input_size; ++i) { if (entries[i].first != -1) { ++good_cnt; avg_cnt += entries[i].second.size(); } } printf("Number of covered positions = %d\n", good_cnt); printf("Average number of references per covered position = %.4lf\n", static_cast<double>(avg_cnt) / good_cnt); ProcessEntries(entries, input_size, fout); printf("Entries processed.\n"); } fclose(fout); return 0; }