普通文本  |  365行  |  14.39 KB

// Copyright 2008 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <config.h>
#include "encodetable.h"
#include <string>
#include "addrcache.h"
#include "codetable.h"
#include "instruction_map.h"
#include "logging.h"
#include "google/output_string.h"
#include "varint_bigendian.h"
#include "vcdiff_defs.h"

namespace open_vcdiff {

// VCDiffCodeTableWriter members and methods

// If interleaved is true, the encoder writes each delta file window
// by interleaving instructions and sizes with their corresponding
// addresses and data, rather than placing these elements into three
// separate sections.  This facilitates providing partially
// decoded results when only a portion of a delta file window
// is received (e.g. when HTTP over TCP is used as the
// transmission protocol.)  The interleaved format is
// not consistent with the VCDIFF draft standard.
//
VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
    : max_mode_(VCDiffAddressCache::DefaultLastMode()),
      dictionary_size_(0),
      target_length_(0),
      code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
      instruction_map_(NULL),
      last_opcode_index_(-1),
      add_checksum_(false),
      checksum_(0),
      match_counts_(kMaxMatchSize, 0) {
  InitSectionPointers(interleaved);
}

VCDiffCodeTableWriter::VCDiffCodeTableWriter(
    bool interleaved,
    int near_cache_size,
    int same_cache_size,
    const VCDiffCodeTableData& code_table_data,
    unsigned char max_mode)
    : max_mode_(max_mode),
      address_cache_(near_cache_size, same_cache_size),
      dictionary_size_(0),
      target_length_(0),
      code_table_data_(&code_table_data),
      instruction_map_(NULL),
      last_opcode_index_(-1),
      add_checksum_(false),
      checksum_(0)  {
  InitSectionPointers(interleaved);
}

VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
  if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
    delete instruction_map_;
  }
}

void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
  if (interleaved) {
    data_for_add_and_run_ = &instructions_and_sizes_;
    addresses_for_copy_ = &instructions_and_sizes_;
  } else {
    data_for_add_and_run_ = &separate_data_for_add_and_run_;
    addresses_for_copy_ = &separate_addresses_for_copy_;
  }
}

bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
  dictionary_size_ = dictionary_size;
  if (!instruction_map_) {
    if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
      instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
    } else {
      instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
    }
    if (!instruction_map_) {
      return false;
    }
  }
  if (!address_cache_.Init()) {
    return false;
  }
  target_length_ = 0;
  last_opcode_index_ = -1;
  return true;
}

// The VCDiff format allows each opcode to represent either
// one or two delta instructions.  This function will first
// examine the opcode generated by the last call to EncodeInstruction.
// If that opcode was a single-instruction opcode, this function checks
// whether there is a compound (double-instruction) opcode that can
// combine that single instruction with the instruction that is now
// being added, and so save a byte of space.  In that case, the
// single-instruction opcode at position last_opcode_index_ will be
// overwritten with the new double-instruction opcode.
//
// In the majority of cases, no compound opcode will be possible,
// and a new single-instruction opcode will be appended to
// instructions_and_sizes_, followed by a representation of its size
// if the opcode does not implicitly give the instruction size.
//
// As an example, say instructions_and_sizes_ contains 10 bytes, the last
// of which contains the opcode 0x02 (ADD size 1).  Because that was the
// most recently added opcode, last_opcode_index_ has the value 10.
// EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
// The function will replace the old opcode 0x02 with the double-instruction
// opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
//
// All of the double-instruction opcodes in the standard code table
// have implicit sizes, meaning that the size of the instruction will not
// need to be written to the instructions_and_sizes_ string separately
// from the opcode.  If a custom code table were used that did not have
// this property, then instructions_and_sizes_ might contain a
// double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
// followed by the size of the COPY, then by the size of the ADD.
// If using the SDCH interleaved format, the address of the COPY instruction
// would follow its size, so the ordering would be
// [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
//
void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
                                              size_t size,
                                              unsigned char mode) {
  if (!instruction_map_) {
    LOG(DFATAL) << "EncodeInstruction() called without calling Init()"
                << LOG_ENDL;
    return;
  }
  if (last_opcode_index_ >= 0) {
    const unsigned char last_opcode =
        instructions_and_sizes_[last_opcode_index_];
    // The encoding engine should not generate two ADD instructions in a row.
    // This won't cause a failure, but it's inefficient encoding and probably
    // represents a bug in the higher-level logic of the encoder.
    if ((inst == VCD_ADD) &&
        (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
      LOG(WARNING) << "EncodeInstruction() called for two ADD instructions"
                      " in a row" << LOG_ENDL;
    }
    OpcodeOrNone compound_opcode = kNoOpcode;
    if (size <= UCHAR_MAX) {
      compound_opcode =
          instruction_map_->LookupSecondOpcode(last_opcode,
                                               inst,
                                               static_cast<unsigned char>(size),
                                               mode);
      if (compound_opcode != kNoOpcode) {
        instructions_and_sizes_[last_opcode_index_] =
            static_cast<unsigned char>(compound_opcode);
        last_opcode_index_ = -1;
        return;
      }
    }
    // Try finding a compound opcode with size 0.
    compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
                                                           inst,
                                                           0,
                                                           mode);
    if (compound_opcode != kNoOpcode) {
      instructions_and_sizes_[last_opcode_index_] =
          static_cast<unsigned char>(compound_opcode);
      last_opcode_index_ = -1;
      AppendSizeToString(size, &instructions_and_sizes_);
      return;
    }
  }
  OpcodeOrNone opcode = kNoOpcode;
  if (size <= UCHAR_MAX) {
    opcode =
        instruction_map_->LookupFirstOpcode(inst,
                                            static_cast<unsigned char>(size),
                                            mode);
    if (opcode != kNoOpcode) {
      instructions_and_sizes_.push_back(static_cast<char>(opcode));
      last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
      return;
    }
  }
  // There should always be an opcode with size 0.
  opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
  if (opcode == kNoOpcode) {
    LOG(DFATAL) << "No matching opcode found for inst " << inst
                << ", mode " << mode << ", size 0" << LOG_ENDL;
    return;
  }
  instructions_and_sizes_.push_back(static_cast<char>(opcode));
  last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
  AppendSizeToString(size, &instructions_and_sizes_);
}

void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
  EncodeInstruction(VCD_ADD, size);
  data_for_add_and_run_->append(data, size);
  target_length_ += size;
}

void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
  if (!instruction_map_) {
    LOG(DFATAL) << "VCDiffCodeTableWriter::Copy() called without calling Init()"
                << LOG_ENDL;
    return;
  }
  // If a single interleaved stream of encoded values is used
  // instead of separate sections for instructions, addresses, and data,
  // then the string instructions_and_sizes_ may be the same as
  // addresses_for_copy_.  The address should therefore be encoded
  // *after* the instruction and its size.
  int32_t encoded_addr = 0;
  const unsigned char mode = address_cache_.EncodeAddress(
      offset,
      static_cast<VCDAddress>(dictionary_size_ + target_length_),
      &encoded_addr);
  EncodeInstruction(VCD_COPY, size, mode);
  if (address_cache_.WriteAddressAsVarintForMode(mode)) {
    VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
  } else {
    addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
  }
  target_length_ += size;
  if (size >= match_counts_.size()) {
    match_counts_.resize(size * 2, 0);  // Be generous to avoid resizing again
  }
  ++match_counts_[size];
}

void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
  EncodeInstruction(VCD_RUN, size);
  data_for_add_and_run_->push_back(byte);
  target_length_ += size;
}

size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
  return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
}

void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
  VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
}

void VCDiffCodeTableWriter::AppendSizeToOutputString(
    size_t size,
    OutputStringInterface* out) {
  VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
}

// This calculation must match the items added between "Start of Delta Encoding"
// and "End of Delta Encoding" in Output(), below.
size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
  size_t length_of_the_delta_encoding =
    CalculateLengthOfSizeAsVarint(target_length_) +
    1 +  // Delta_Indicator
    CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
    CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
    CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
    separate_data_for_add_and_run_.size() +
    instructions_and_sizes_.size() +
    separate_addresses_for_copy_.size();
  if (add_checksum_) {
    length_of_the_delta_encoding +=
        VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
  }
  return length_of_the_delta_encoding;
}

void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
  if (instructions_and_sizes_.empty()) {
    LOG(WARNING) << "Empty input; no delta window produced" << LOG_ENDL;
  } else {
    const size_t length_of_the_delta_encoding =
        CalculateLengthOfTheDeltaEncoding();
    const size_t delta_window_size =
        length_of_the_delta_encoding +
        1 +  // Win_Indicator
        CalculateLengthOfSizeAsVarint(dictionary_size_) +
        CalculateLengthOfSizeAsVarint(0) +
        CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
    // append() will be called many times on the output string; make sure
    // the output string is resized only once at most.
    out->ReserveAdditionalBytes(delta_window_size);

    // Add first element: Win_Indicator
    if (add_checksum_) {
      out->push_back(VCD_SOURCE | VCD_CHECKSUM);
    } else {
      out->push_back(VCD_SOURCE);
    }
    // Source segment size: dictionary size
    AppendSizeToOutputString(dictionary_size_, out);
    // Source segment position: 0 (start of dictionary)
    AppendSizeToOutputString(0, out);

    // [Here is where a secondary compressor would be used
    //  if the encoder and decoder supported that feature.]

    AppendSizeToOutputString(length_of_the_delta_encoding, out);
    // Start of Delta Encoding
    const size_t size_before_delta_encoding = out->size();
    AppendSizeToOutputString(target_length_, out);
    out->push_back(0x00);  // Delta_Indicator: no compression
    AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
    AppendSizeToOutputString(instructions_and_sizes_.size(), out);
    AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
    if (add_checksum_) {
      // The checksum is a 32-bit *unsigned* integer.  VarintBE requires a
      // signed type, so use a 64-bit signed integer to store the checksum.
      VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
                                              out);
    }
    out->append(separate_data_for_add_and_run_.data(),
                separate_data_for_add_and_run_.size());
    out->append(instructions_and_sizes_.data(),
                instructions_and_sizes_.size());
    out->append(separate_addresses_for_copy_.data(),
                separate_addresses_for_copy_.size());
    // End of Delta Encoding
    const size_t size_after_delta_encoding = out->size();
    if (length_of_the_delta_encoding !=
        (size_after_delta_encoding - size_before_delta_encoding)) {
      LOG(DFATAL) << "Internal error: calculated length of the delta encoding ("
                  << length_of_the_delta_encoding
                  << ") does not match actual length ("
                  << (size_after_delta_encoding - size_before_delta_encoding)
                  << LOG_ENDL;
    }
    separate_data_for_add_and_run_.clear();
    instructions_and_sizes_.clear();
    separate_addresses_for_copy_.clear();
    if (target_length_ == 0) {
      LOG(WARNING) << "Empty target window" << LOG_ENDL;
    }
  }

  // Reset state for next window; assume we are using same code table
  // and dictionary.  The caller will have to invoke Init() if a different
  // dictionary is used.
  //
  // Notably, Init() calls address_cache_.Init().  This resets the address
  // cache between delta windows, as required by RFC section 5.1.
  if (!Init(dictionary_size_)) {
    LOG(DFATAL) << "Internal error: calling Init() to reset "
                   "VCDiffCodeTableWriter state failed" << LOG_ENDL;
  }
}

};  // namespace open_vcdiff