
kc3-lang/brotli/research/deorummolae.cc

Commit

  • Author : Evgenii Kliuchnikov
    Date : 2025-01-07 13:59:06
    Hash : 57610b71
    Message : translate includes in brotli/research
      PiperOrigin-RevId: 713033033

  • research/deorummolae.cc
  • #include "deorummolae.h"
    
    #include <array>
    #include <cstdio>
    
    #include <sais.hxx>
    
    #if defined(_MSC_VER)
    #include <intrin.h>  /* __popcnt64 */
    #endif
    
    /* Used for quick SA-entry to file mapping. Each file is padded to a size
       that is a multiple of the chunk size. */
    #define CHUNK_SIZE 64
    /* Length of a substring that is considered covered by a dictionary string. */
    #define CUT_MATCH 6
    /* Minimal dictionary entry size. */
    #define MIN_MATCH 24
    
    /* Non-tunable definitions. */
    #define CHUNK_MASK (CHUNK_SIZE - 1)
    #define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
    
    /* File coverage: every bit set to 1 denotes a file covered by an isle. */
    typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
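    /* E.g. with DM_LOG_MAX_FILES == 6, COVERAGE_SIZE is 1, so a single
       uint64_t word tracks up to 64 files. */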
    
    /* Symbol of text alphabet. */
    typedef int32_t TextChar;
    
    /* Pointer to position in text. */
    typedef uint32_t TextIdx;
    
    /* SAIS sarray_type; unfortunately, it must be a signed type. */
    typedef int32_t TextSaIdx;
    
    static size_t popcount(uint64_t u) {
    #if defined(_MSC_VER)
      return static_cast<size_t>(__popcnt64(u));
    #else
      return static_cast<size_t>(__builtin_popcountll(u));
    #endif
    }
    
    /* Condense terminators and pad file entries. */
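    /* E.g. with terminators Tn (values >= 256), "ab T1 T2 T3 c T4" condenses
       to "ab T1 c T4" (a run keeps its first terminator) and is then padded
       with T4 up to a multiple of CHUNK_SIZE. */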
    static void rewriteText(std::vector<TextChar>* text) {
      TextChar terminator = text->back();
      TextChar prev = terminator;
      TextIdx to = 0;
      for (TextIdx from = 0; from < text->size(); ++from) {
        TextChar next = text->at(from);
        if (next < 256 || prev < 256) {
          text->at(to++) = next;
          if (next >= 256) terminator = next;
        }
        prev = next;
      }
      text->resize(to);
      if (text->empty()) text->push_back(terminator);
      while (text->size() & CHUNK_MASK) text->push_back(terminator);
    }
    
    /* Re-enumerate terminators to get a smaller alphabet. */
    static void remapTerminators(std::vector<TextChar>* text,
        TextChar* next_terminator) {
      TextChar prev = -1;
      TextChar x = 256;
      for (TextIdx i = 0; i < text->size(); ++i) {
        TextChar next = text->at(i);
        if (next < 256) {  // Char.
          // Do nothing.
        } else if (prev < 256) {  // Terminator after char.
          next = x++;
        } else {  // Terminator after terminator.
          next = prev;
        }
        text->at(i) = next;
        prev = next;
      }
      *next_terminator = x;
    }
    
    /* Combine all file entries; create mapping position->file. */
    static void buildFullText(std::vector<std::vector<TextChar>>* data,
        std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
        std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
      file_map->resize(0);
      file_offset->resize(0);
      full_text->resize(0);
      for (TextIdx i = 0; i < data->size(); ++i) {
        file_offset->push_back(full_text->size());
        std::vector<TextChar>& file = data->at(i);
        rewriteText(&file);
        full_text->insert(full_text->end(), file.begin(), file.end());
        /* One file_map entry per CHUNK_SIZE characters; rewriteText has padded
           the file to a multiple of CHUNK_SIZE. */
        file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
      }
      /* NB: the terminator-remapping pass is intentionally disabled. */
      if (false) remapTerminators(full_text, next_terminator);
    }
    
    /* Build longest-common-prefix based on suffix array and text.
       TODO(eustas): borrowed -> unknown efficiency. */
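    /* This looks like the Kasai et al. linear-time scheme: moving from text
       position i to i + 1 can shrink the match length k by at most one, so
       the total work is O(n). */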
    static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
        std::vector<TextIdx>* lcp, std::vector<TextIdx>* inverse_sa) {
      TextIdx size = static_cast<TextIdx>(text->size());
      lcp->resize(size);
      TextIdx k = 0;
      lcp->at(size - 1) = 0;
      for (TextIdx i = 0; i < size; ++i) {
        if (inverse_sa->at(i) == size - 1) {
          k = 0;
          continue;
        }
        // Suffix that follows the i-th suffix in the suffix-array order.
        TextIdx j = sa->at(inverse_sa->at(i) + 1);
        while (i + k < size && j + k < size &&
            text->at(i + k) == text->at(j + k)) {
          ++k;
        }
        lcp->at(inverse_sa->at(i)) = k;
        if (k > 0) --k;
      }
    }
    
    /* An isle is a range in the SA with LCP not less than some value.
       When we raise the LCP requirement, the isle sinks and smaller isles
       appear instead. */
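    /* E.g. for an LCP array [2, 5, 5, 3, 2], the isle with lcp 5 (LCP
       positions 1..2) is nested inside the isle with lcp 3 (positions 1..3);
       raising the requirement from 3 to 5 sinks the outer isle. */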
    typedef struct {
      TextIdx lcp;
      TextIdx l;
      TextIdx r;
      Coverage coverage;
    } Isle;
    
    /* Helper routine for `cutMatch`. */
    static void poisonData(TextIdx pos, TextIdx length,
        std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
        std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
      TextIdx f = file_map->at(pos / CHUNK_SIZE);
      pos -= file_offset->at(f);
      std::vector<TextChar>& file = data->at(f);
      TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
      for (TextIdx j = 0; j < l; j++, pos++) {
        if (file[pos] >= 256) continue;
        if (file[pos + 1] >= 256) {
          file[pos] = file[pos + 1];
        } else if (pos > 0 && file[pos - 1] >= 256) {
          file[pos] = file[pos - 1];
        } else {
          file[pos] = (*next_terminator)++;
        }
      }
    }
    
    /* Remove substrings of a given match from the files.
       Substrings are replaced with unique terminators, so the next-iteration
       SA can not cross the removed areas. */
    static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
        TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
        std::vector<TextIdx>* inverse_sa, TextChar* next_terminator,
        std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
      while (length >= CUT_MATCH) {
        TextIdx i = index;
        while (lcp->at(i) >= length) {
          i++;
          poisonData(
              sa->at(i), length, data, file_map, file_offset, next_terminator);
        }
        while (true) {
          poisonData(
              sa->at(index), length, data, file_map, file_offset, next_terminator);
          if (index == 0 || lcp->at(index - 1) < length) break;
          index--;
        }
        length--;
        index = inverse_sa->at(sa->at(index) + 1);
      }
    }
    
    std::string DM_generate(size_t dictionary_size_limit,
        const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
      {
        TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
        if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
          fprintf(stderr, "dictionary_size_limit is too large\n");
          return "";
        }
      }
    
      /* Could use 256 + '0' for easier debugging. */
      TextChar next_terminator = 256;
    
      std::string output;
      std::vector<std::vector<TextChar>> data;
    
      TextIdx offset = 0;
      size_t num_samples = sample_sizes.size();
      if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
      for (size_t n = 0; n < num_samples; ++n) {
        TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
        if (delta != sample_sizes[n]) {
          fprintf(stderr, "sample is too large\n");
          return "";
        }
        if (delta == 0) {
          fprintf(stderr, "0-length samples are prohibited\n");
          return "";
        }
        TextIdx next_offset = offset + delta;
        if (next_offset <= offset) {
          fprintf(stderr, "corpus is too large\n");
          return "";
        }
        data.push_back(
            std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
        offset = next_offset;
        data.back().push_back(next_terminator++);
      }
    
      /* Most arrays are allocated once, and then just resized to smaller and
         smaller sizes. */
      std::vector<TextChar> full_text;
      std::vector<TextIdx> file_map;
      std::vector<TextIdx> file_offset;
      std::vector<TextIdx> sa;
      std::vector<TextIdx> inverse_sa;
      std::vector<TextIdx> lcp;
      std::vector<Isle> isles;
      std::vector<char> output_data;
      TextIdx total = 0;
      TextIdx total_cost = 0;
      TextIdx best_cost;
      Isle best_isle;
      size_t min_count = num_samples;
    
      while (true) {
        TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
        buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
        sa.resize(full_text.size());
        /* Hopefully, a non-negative TextSaIdx has the same representation as
           its TextIdx counterpart. */
        saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
            static_cast<TextChar>(full_text.size()), next_terminator);
        inverse_sa.resize(full_text.size());
        for (TextIdx i = 0; i < full_text.size(); ++i) {
          inverse_sa[sa[i]] = i;
        }
        buildLcp(&full_text, &sa, &lcp, &inverse_sa);
    
        /* Do not rebuild SA/LCP, just use different selection. */
      retry:
        best_cost = 0;
        best_isle = {0, 0, 0, {{0}}};
        isles.resize(0);
        isles.push_back(best_isle);
    
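        /* `isles` acts as a stack with strictly increasing `lcp`; when lcp[i]
           drops below the top's lcp, finished isles are popped and scored as
           dictionary-entry candidates. */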
        for (TextIdx i = 0; i < lcp.size(); ++i) {
          TextIdx l = i;
          Coverage cov = {{0}};
          size_t f = file_map[sa[i] / CHUNK_SIZE];
          cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
          while (lcp[i] < isles.back().lcp) {
            Isle& top = isles.back();
            top.r = i;
            l = top.l;
            for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
            size_t count = 0;
            for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
            TextIdx effective_lcp = top.lcp;
            /* Restrict (last) dictionary entry length. */
            if (effective_lcp > max_match) effective_lcp = max_match;
            TextIdx cost = count * effective_lcp;
            if (cost > best_cost && count >= min_count &&
                effective_lcp >= MIN_MATCH) {
              best_cost = cost;
              best_isle = top;
              best_isle.lcp = effective_lcp;
            }
            isles.pop_back();
            for (size_t x = 0; x < cov.size(); ++x) {
              isles.back().coverage[x] |= cov[x];
            }
          }
          if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
          for (size_t x = 0; x < cov.size(); ++x) {
            isles.back().coverage[x] |= cov[x];
          }
        }
    
        /* When matches with the required file coverage fail the length
           restrictions, lower the coverage requirement (min_count). */
        if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
          if (min_count >= 8) {
            min_count = (min_count * 7) / 8;
            fprintf(stderr, "Retry: min_count=%zu\n", min_count);
            goto retry;
          }
          break;
        }
    
        /* Save the entry. */
        fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
            total_cost, best_cost, total, best_isle.lcp);
        int* piece = &full_text[sa[best_isle.l]];
        output.insert(output.end(), piece, piece + best_isle.lcp);
        total += best_isle.lcp;
        total_cost += best_cost;
        cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &inverse_sa,
            &next_terminator, &file_map, &file_offset);
        if (total >= dictionary_size_limit) break;
      }
    
      return output;
    }
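
For context, here is a minimal sketch of how the entry point might be driven. The toy samples below are illustrative only; `DM_generate` is declared in deorummolae.h. Note that with samples this small the result is empty, since every dictionary entry must be at least MIN_MATCH (24) bytes long and shared by enough files; a real corpus is needed for useful output.

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "deorummolae.h"

    int main() {
      /* Two toy samples, stored back-to-back as DM_generate expects. */
      const char* text = "abcabkabcabl";
      std::vector<uint8_t> sample_data(text, text + 12);
      std::vector<size_t> sample_sizes = {6, 6};
      std::string dict = DM_generate(1024, sample_sizes, sample_data.data());
      fprintf(stdout, "dictionary bytes: %zu\n", dict.size());
      return 0;
    }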