Hash :
0454ab4e
Author :
Date :
2014-02-14T15:04:23
Updates to Brotli compression format, decoder and encoder
This commit contains a batch of changes that were made to the Brotli
compression algorithm in the last month. Most important changes:
* Fixes to the spec.
* Change of code length code order.
* Use a 2-level Huffman lookup table in the decoder.
* Faster uncompressed meta-block decoding.
* Optimized encoding of the Huffman code.
* Detection of UTF-8 input encoding.
* UTF-8 based literal cost modeling for improved
backward reference selection.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Function to find backward reference copies.
#include "./backward_references.h"
#include <algorithm>
#include <vector>
#include "./command.h"
namespace brotli {
void CreateBackwardReferences(size_t num_bytes,
size_t position,
const uint8_t* ringbuffer,
const float* literal_cost,
size_t ringbuffer_mask,
const size_t max_backward_limit,
Hasher* hasher,
std::vector<Command>* commands) {
// Length heuristic that seems to help probably by better selection
// of lazy matches of similar lengths.
int insert_length = 0;
size_t i = position & ringbuffer_mask;
const int i_diff = position - i;
const size_t i_end = i + num_bytes;
double average_cost = 0.0;
for (int k = position; k < position + num_bytes; ++k) {
average_cost += literal_cost[k & ringbuffer_mask];
}
average_cost /= num_bytes;
hasher->set_average_cost(average_cost);
// M1 match is for considering for two repeated copies, if moving
// one literal form the previous copy to the current one allows the
// current copy to be more efficient (because the way static dictionary
// codes words). M1 matching improves text compression density by ~0.15 %.
bool match_found_M1 = false;
size_t best_len_M1 = 0;
size_t best_len_code_M1 = 0;
size_t best_dist_M1 = 0;
double best_score_M1 = 0;
while (i + 2 < i_end) {
size_t best_len = 0;
size_t best_len_code = 0;
size_t best_dist = 0;
double best_score = 0;
size_t max_distance = std::min(i + i_diff, max_backward_limit);
bool in_dictionary;
hasher->set_insert_length(insert_length);
bool match_found = hasher->FindLongestMatch(
ringbuffer, literal_cost, ringbuffer_mask,
i + i_diff, i_end - i, max_distance,
&best_len, &best_len_code, &best_dist, &best_score, &in_dictionary);
bool best_in_dictionary = in_dictionary;
if (match_found) {
if (match_found_M1 && best_score_M1 > best_score) {
// Two copies after each other. Take the last literal from the
// last copy, and use it as the first of this one.
(commands->rbegin())->copy_length_ -= 1;
(commands->rbegin())->copy_length_code_ -= 1;
hasher->Store(ringbuffer + i, i + i_diff);
--i;
best_len = best_len_M1;
best_len_code = best_len_code_M1;
best_dist = best_dist_M1;
best_score = best_score_M1;
// in_dictionary doesn't need to be correct, but it is the only
// reason why M1 matching should be beneficial here. Setting it here
// will only disable further M1 matching against this copy.
best_in_dictionary = true;
in_dictionary = true;
} else {
// Found a match. Let's look for something even better ahead.
int delayed_backward_references_in_row = 0;
while (i + 4 < i_end &&
delayed_backward_references_in_row < 4) {
size_t best_len_2 = 0;
size_t best_len_code_2 = 0;
size_t best_dist_2 = 0;
double best_score_2 = 0;
max_distance = std::min(i + i_diff + 1, max_backward_limit);
hasher->Store(ringbuffer + i, i + i_diff);
match_found = hasher->FindLongestMatch(
ringbuffer, literal_cost, ringbuffer_mask,
i + i_diff + 1, i_end - i - 1, max_distance,
&best_len_2, &best_len_code_2, &best_dist_2, &best_score_2,
&in_dictionary);
double cost_diff_lazy = 0;
if (best_len >= 4) {
cost_diff_lazy +=
literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
}
{
const int tail_length = best_len_2 - best_len + 1;
for (int k = 0; k < tail_length; ++k) {
cost_diff_lazy -=
literal_cost[(i + best_len + k) & ringbuffer_mask] -
average_cost;
}
}
// If we are not inserting any symbols, inserting one is more
// expensive than if we were inserting symbols anyways.
if (insert_length < 1) {
cost_diff_lazy += 0.97;
}
// Add bias to slightly avoid lazy matching.
cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
// Ok, let's just write one byte for now and start a match from the
// next byte.
++insert_length;
++delayed_backward_references_in_row;
best_len = best_len_2;
best_len_code = best_len_code_2;
best_dist = best_dist_2;
best_score = best_score_2;
best_in_dictionary = in_dictionary;
i++;
} else {
break;
}
}
}
Command cmd;
cmd.insert_length_ = insert_length;
cmd.copy_length_ = best_len;
cmd.copy_length_code_ = best_len_code;
cmd.copy_distance_ = best_dist;
commands->push_back(cmd);
hasher->set_last_distance(best_dist);
insert_length = 0;
++i;
// Copy all copied literals to the hasher, except the last one.
// We cannot store the last one yet, otherwise we couldn't find
// the possible M1 match.
for (int j = 1; j < best_len - 1; ++j) {
if (i + 2 < i_end) {
hasher->Store(ringbuffer + i, i + i_diff);
}
++i;
}
// Prepare M1 match.
if (best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
max_distance = std::min(i + i_diff, max_backward_limit);
match_found_M1 = hasher->FindLongestMatch(
ringbuffer, literal_cost, ringbuffer_mask,
i + i_diff, i_end - i, max_distance,
&best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1,
&in_dictionary);
} else {
match_found_M1 = false;
in_dictionary = false;
}
// This byte is just moved from the previous copy to the current,
// that is no gain.
best_score_M1 -= literal_cost[i & ringbuffer_mask];
// Adjust for losing the opportunity for lazy matching.
best_score_M1 -= 3.75;
// Store the last one of the match.
if (i + 2 < i_end) {
hasher->Store(ringbuffer + i, i + i_diff);
}
++i;
} else {
match_found_M1 = false;
++insert_length;
hasher->Store(ringbuffer + i, i + i_diff);
++i;
}
}
insert_length += (i_end - i);
if (insert_length > 0) {
Command cmd;
cmd.insert_length_ = insert_length;
cmd.copy_length_ = 0;
cmd.copy_distance_ = 0;
commands->push_back(cmd);
}
}
} // namespace brotli