Edit

kc3-lang/brotli/research/sieve.cc

Branch :

  • Show log

    Commit

  • Author : Eugene Kliuchnikov
    Date : 2021-09-08 09:18:45
    Hash : 62662f87
    Message : Strip "./" in includes (#925) Co-authored-by: Eugene Kliuchnikov <eustas@chromium.org>

  • research/sieve.cc
  • #include "sieve.h"
    
    /* Pointer to position in (combined corpus) text. */
    typedef uint32_t TextIdx;
    
    /* Index of sample / generation. */
    typedef uint16_t SampleIdx;
    
    typedef struct Slot {
      TextIdx next;
      TextIdx offset;
      SampleIdx presence;
      SampleIdx mark;
    } Slot;
    
    static const TextIdx kNowhere = static_cast<TextIdx>(-1);
    
    static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
        TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
      TextIdx from = kNowhere;
      TextIdx to = kNowhere;
      TextIdx result = 0;
      SampleIdx targetPresence = minPresence;
      for (TextIdx i = 0; i < end; ++i) {
        if (i == middle) {
          targetPresence++;
        }
        Slot& item = map[shortcut[i]];
        if (item.mark != iteration) {
          item.mark = iteration;
          if (item.presence >= targetPresence) {
            if ((to == kNowhere) || (to < i)) {
              if (from != kNowhere) {
                result += to - from;
              }
              from = i;
            }
            to = i + sliceLen;
          }
        }
      }
      if (from != kNowhere) {
        result += to - from;
      }
      return result;
    }
    
    static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
        Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
        SampleIdx minPresence, SampleIdx iteration) {
      std::string output;
      TextIdx from = kNowhere;
      TextIdx to = kNowhere;
      SampleIdx targetPresence = minPresence;
      for (TextIdx i = 0; i < end; ++i) {
        if (i == middle) {
          targetPresence++;
        }
        Slot& item = map[shortcut[i]];
        if (item.mark != iteration) {
          item.mark = iteration;
          if (item.presence >= targetPresence) {
            if ((to == kNowhere) || (to < i)) {
              if (from != kNowhere) {
                output.insert(output.end(), &data[from], &data[to]);
              }
              from = i;
            }
            to = i + sliceLen;
          }
        }
      }
      if (from != kNowhere) {
        output.insert(output.end(), &data[from], &data[to]);
      }
      return output;
    }
    
    std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
        const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
      /* Parameters aliasing */
      TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
      if (targetSize != dictionary_size_limit) {
        fprintf(stderr, "dictionary_size_limit is too large\n");
        return "";
      }
      TextIdx sliceLen = static_cast<TextIdx>(slice_len);
      if (sliceLen != slice_len) {
        fprintf(stderr, "slice_len is too large\n");
        return "";
      }
      if (sliceLen < 1) {
        fprintf(stderr, "slice_len is too small\n");
        return "";
      }
      SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
      if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
        fprintf(stderr, "too many samples\n");
        return "";
      }
      const uint8_t* data = sample_data;
    
      TextIdx total = 0;
      std::vector<TextIdx> offsets;
      for (SampleIdx i = 0; i < numSamples; ++i) {
        TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
        if (delta != sample_sizes[i]) {
          fprintf(stderr, "sample is too large\n");
          return "";
        }
        if (delta == 0) {
          fprintf(stderr, "empty samples are prohibited\n");
          return "";
        }
        if (total + delta <= total) {
          fprintf(stderr, "corpus is too large\n");
          return "";
        }
        total += delta;
        offsets.push_back(total);
      }
    
      if (total * 2 < total) {
        fprintf(stderr, "corpus is too large\n");
        return "";
      }
    
      if (total < sliceLen) {
        fprintf(stderr, "slice_len is larger than corpus size\n");
        return "";
      }
    
      /*****************************************************************************
       * Build coverage map.
       ****************************************************************************/
      std::vector<Slot> map;
      std::vector<TextIdx> shortcut;
      map.push_back({0, 0, 0, 0});
      TextIdx end = total - sliceLen;
      TextIdx hashLen = 11;
      while (hashLen < 29 && ((1u << hashLen) < end)) {
        hashLen += 3;
      }
      hashLen -= 3;
      TextIdx hashMask = (1u << hashLen) - 1u;
      std::vector<TextIdx> hashHead(1 << hashLen);
      TextIdx hashSlot = 1;
      SampleIdx piece = 0;
      TextIdx hash = 0;
      TextIdx lShift = 3;
      TextIdx rShift = hashLen - lShift;
      for (TextIdx i = 0; i < sliceLen - 1; ++i) {
        TextIdx v = data[i];
        hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
      }
      TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
      TextIdx rShiftX = hashLen - lShiftX;
      for (TextIdx i = 0; i < end; ++i) {
        TextIdx v = data[i + sliceLen - 1];
        hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
    
        if (offsets[piece] == i) {
          piece++;
        }
        TextIdx slot = hashHead[hash];
        while (slot != 0) {
          Slot& item = map[slot];
          TextIdx start = item.offset;
          bool miss = false;
          for (TextIdx j = 0; j < sliceLen; ++j) {
            if (data[i + j] != data[start + j]) {
              miss = true;
              break;
            }
          }
          if (!miss) {
            if (item.mark != piece) {
              item.mark = piece;
              item.presence++;
            }
            shortcut.push_back(slot);
            break;
          }
          slot = item.next;
        }
        if (slot == 0) {
          map.push_back({hashHead[hash], i, 1, piece});
          hashHead[hash] = hashSlot;
          shortcut.push_back(hashSlot);
          hashSlot++;
        }
        v = data[i];
        hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
      }
    
      /*****************************************************************************
       * Build dictionary of specified size.
       ****************************************************************************/
      SampleIdx a = 1;
      TextIdx size = dryRun(
          sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
      /* Maximal output is smaller than target. */
      if (size <= targetSize) {
        return createDictionary(
            data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
      }
    
      SampleIdx b = numSamples;
      size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
      if (size == targetSize) {
        return createDictionary(
            data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
      }
      /* Run binary search. */
      if (size < targetSize) {
        /* size(a) > targetSize > size(b) && a < m < b */
        while (a + 1 < b) {
          SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
          size = dryRun(
              sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
          if (size < targetSize) {
            b = m;
          } else if (size > targetSize) {
            a = m;
          } else {
            return createDictionary(
                data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
          }
        }
      } else {
        a = b;
      }
      /* size(minPresence) > targetSize > size(minPresence + 1) */
      SampleIdx minPresence = a;
      TextIdx c = 0;
      TextIdx d = end;
      /* size(a) < targetSize < size(b) && a < m < b */
      while (c + 1 < d) {
        TextIdx m = (c + d) / 2;
        size = dryRun(
            sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
        if (size < targetSize) {
          c = m;
        } else if (size > targetSize) {
          d = m;
        } else {
          return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
              m, minPresence, ++piece);
        }
      }
    
      bool unrestricted = false;
      if (minPresence <= 2 && !unrestricted) {
        minPresence = 2;
        c = end;
      }
      return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
          minPresence, ++piece);
    }