// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <stdlib.h>
#include "base/logging.h"
#include "base/string_util.h"
#include "net/base/net_util.h"
#include "net/tools/dump_cache/url_to_filename_encoder.h"
using std::string;
namespace {
// Returns 1 if the first |num_digits| characters of |buf| are all hex
// digits, and 0 otherwise.
// A '\0' terminator is not a hex digit, so a string shorter than
// |num_digits| yields 0 without reading past its end.
int HexDigitsPrefix(const char* buf, int num_digits) {
  for (int remaining = num_digits; remaining > 0; --remaining, ++buf) {
    if (!IsHexDigit(*buf)) {
      return 0;  // Non-hex character, including the terminating '\0'.
    }
  }
  return 1;
}
// On WIN32 builds, route calls to strtoull to _strtoui64 so that the hex
// parser below can use a single spelling on every platform.
// NOTE(review): presumably needed because the targeted Windows CRT does not
// provide strtoull -- confirm against the supported toolchains.
#ifdef WIN32
#define strtoull _strtoui64
#endif
// Parses the leading base-16 number in |str| with strtoull and returns it.
// If strtoull consumes no characters at all (its end pointer is still |str|),
// no number was found and |deflt| is returned instead.
uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
  char* end_of_number = NULL;
  const uint64 parsed = strtoull(str, &end_of_number, 16);
  if (end_of_number == str) {
    return deflt;  // Nothing was parsed.
  }
  return parsed;
}
}
namespace net {
// The escape character choice is made here -- all code and tests in this
// directory are based off of this constant. However, our testdata
// has tons of dependencies on this, so it cannot be changed without
// re-running those tests and fixing them.
const char UrlToFilenameEncoder::kEscapeChar = ',';
// Written right after kEscapeChar when an over-long segment is chopped, to
// mark that the directory separator that follows is artificial and is not
// part of the original URL.
const char UrlToFilenameEncoder::kTruncationChar = '-';
// Segments longer than this many characters are split into several
// path components.
const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
// Moves as much of |*segment| as fits into one path component onto the end
// of |*dest|. |*segment| must not be empty.
//  - "." and ".." are written with a leading kEscapeChar and fully consumed.
//  - A segment of at most kMaximumSubdirectoryLength characters is written
//    verbatim and fully consumed.
//  - A longer segment is cut: its head is written followed by kEscapeChar and
//    kTruncationChar, and the unwritten tail is left in |*segment| for the
//    caller to deal with.
void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
  CHECK(!segment->empty());

  const bool is_dot_segment = (*segment == ".") || (*segment == "..");
  if (is_dot_segment) {
    dest->append(1, kEscapeChar);
    dest->append(*segment);
    segment->clear();
    return;
  }

  if (segment->size() <= kMaximumSubdirectoryLength) {
    dest->append(*segment);
    segment->clear();
    return;
  }

  // The segment is too long. The ",-" marker that signals an artificial '/'
  // takes two characters, so keep at most two fewer than the limit.
  size_t keep = kMaximumSubdirectoryLength - 2;

  // Do not cut through an escape sequence (kEscapeChar followed by two
  // characters). If kEscapeChar is one of the last two characters we were
  // going to keep, back up so the whole sequence stays in the remainder.
  const string& text = *segment;
  if (text[keep - 1] == kEscapeChar) {
    keep -= 1;
  } else if (text[keep - 2] == kEscapeChar) {
    keep -= 2;
  }

  dest->append(text, 0, keep);
  dest->append(1, kEscapeChar);
  dest->append(1, kTruncationChar);

  // Example: with keep == 3 and a segment of "abcd", "abc,-" has been written
  // and the segment is now "d".
  segment->erase(0, keep);
}
void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
const string& escaped_ending,
char dir_separator,
string* encoded_filename) {
string filename_ending = UrlUtilities::Unescape(escaped_ending);
char encoded[3];
int encoded_len;
string segment;
// TODO(jmarantz): This code would be a bit simpler if we disallowed
// Instaweb allowing filename_prefix to not end in "/". We could
// then change the is routine to just take one input string.
size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
if (start_of_segment == string::npos) {
segment = filename_prefix;
} else {
segment = filename_prefix.substr(start_of_segment + 1);
*encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
}
size_t index = 0;
// Special case the first / to avoid adding a leading kEscapeChar.
if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
encoded_filename->append(segment);
segment.clear();
encoded_filename->append(1, dir_separator);
++index;
}
for (; index < filename_ending.length(); ++index) {
unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
// Note: instead of outputing an empty segment, we let the second slash
// be escaped below.
if ((ch == dir_separator) && !segment.empty()) {
AppendSegment(&segment, encoded_filename);
encoded_filename->append(1, dir_separator);
segment.clear();
} else {
// After removing unsafe chars the only safe ones are _.=+- and alphanums.
if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
(ch == '-') || (('0' <= ch) && (ch <= '9')) ||
(('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
encoded[0] = ch;
encoded_len = 1;
} else {
encoded[0] = kEscapeChar;
encoded[1] = ch / 16;
encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
encoded[2] = ch % 16;
encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
encoded_len = 3;
}
segment.append(encoded, encoded_len);
// If segment is too big, we must chop it into chunks.
if (segment.size() > kMaximumSubdirectoryLength) {
AppendSegment(&segment, encoded_filename);
encoded_filename->append(1, dir_separator);
}
}
}
// Append "," to the leaf filename so the leaf can also be a branch., e.g.
// allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
// /a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed
// us over the 128 char limit, then we will need to append "/" and the
// remaining chars.
segment += kEscapeChar;
AppendSegment(&segment, encoded_filename);
if (!segment.empty()) {
// The last overflow segment is special, because we appended in
// kEscapeChar above. We won't need to check it again for size
// or further escaping.
encoded_filename->append(1, dir_separator);
encoded_filename->append(segment);
}
}
// Decodes |encoded_filename|, whose path components are separated by
// |dir_separator|, and appends the resulting URL text to |*decoded_url|.
// Separators that belong to the URL are always written as '/'.
//
// Returns true only if the input ends right after a kEscapeChar, which is how
// every legal encoded filename ends. Returns false on a malformed escape
// sequence; in that case |*decoded_url| may already hold partial output.
//
// Note: this decoder is not the exact inverse of the EncodeSegment above,
// because it does not take into account a prefix.
bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
                                  char dir_separator,
                                  string* decoded_url) {
  enum State {
    kStart,       // Ordinary characters.
    kEscape,      // Just saw kEscapeChar.
    kFirstDigit,  // Saw kEscapeChar plus the first hex digit.
    kTruncate,    // Saw kEscapeChar + kTruncationChar; expect a separator.
    kEscapeDot    // Saw kEscapeChar + '.'; one more character follows.
  };
  State state = kStart;
  char hex_buffer[3];
  hex_buffer[2] = '\0';
  for (size_t i = 0; i < encoded_filename.size(); ++i) {
    char ch = encoded_filename[i];
    switch (state) {
      case kStart:
        if (ch == kEscapeChar) {
          state = kEscape;
        } else if (ch == dir_separator) {
          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
        } else {
          decoded_url->append(1, ch);
        }
        break;
      case kEscape:
        if (HexDigitsPrefix(&ch, 1) == 1) {
          hex_buffer[0] = ch;
          state = kFirstDigit;
        } else if (ch == kTruncationChar) {
          state = kTruncate;
        } else if (ch == '.') {
          decoded_url->append(1, '.');
          state = kEscapeDot;  // Look for at most one more dot.
        } else if (ch == dir_separator) {
          // Consider url "//x".  This was once encoded to "/,/x,".
          // This code is what skips the first Escape.
          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
          state = kStart;
        } else {
          return false;
        }
        break;
      case kFirstDigit:
        if (HexDigitsPrefix(&ch, 1) == 1) {
          hex_buffer[1] = ch;
          uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
          decoded_url->append(1, static_cast<char>(hex_value));
          state = kStart;
        } else {
          return false;
        }
        break;
      case kTruncate:
        if (ch == dir_separator) {
          // Skip this separator, it was only put in to break up long
          // path segments, but is not part of the URL.
          state = kStart;
        } else {
          return false;
        }
        break;
      case kEscapeDot:
        // An escaped "." is followed either by a second dot ("..") or by the
        // separator that ends the segment. The separator must be translated
        // just like in kStart; copying it raw would leak a '\\' into the URL
        // when |dir_separator| is not '/'.
        if (ch == dir_separator) {
          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
        } else {
          decoded_url->append(1, ch);
        }
        state = kStart;
        break;
    }
  }
  // All legal encoded filenames end in kEscapeChar.
  return (state == kEscape);
}
// Escape the given input |path| and chop any individual components
// of the path which are greater than kMaximumSubdirectoryLength characters
// into two chunks.
//
// Hyphen, backslash, underscore and ASCII alphanumerics are copied through;
// every other byte is written as 'x' followed by two uppercase hex digits.
//
// This legacy version has several issues with aliasing of different URLs,
// inability to represent both /a/b/c and /a/b/c/d, and inability to decode
// the filenames back into URLs.
//
// But there is a large body of slurped data which depends on this format,
// so leave it as the default for spdy_in_mem_edsm_server.
string UrlToFilenameEncoder::LegacyEscape(const string& path) {
  static const char kHexDigits[] = "0123456789ABCDEF";
  string output;

  // Note: We also chop paths into medium sized 'chunks', because the
  // Windows filesystem does not cope well with long filenames.
  size_t last_slash = 0;
  for (size_t index = 0; index < path.length(); index++) {
    // Work on the unsigned byte value. With a signed plain char, bytes
    // >= 0x80 are negative and "/ 16" and "% 16" would produce characters
    // that are not hex digits (possibly even '/').
    const unsigned char ch = static_cast<unsigned char>(path[index]);
    if (ch == 0x5C)
      last_slash = index;
    if ((ch == 0x2D) ||                    // hyphen
        (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
        ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
        ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
        ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
      output.append(1, path[index]);
    } else {
      char encoded[3];
      encoded[0] = 'x';
      encoded[1] = kHexDigits[ch / 16];
      encoded[2] = kHexDigits[ch % 16];
      output.append(encoded, 3);
    }
    // Insert an artificial separator once the current component has grown
    // past the limit, and start counting a new component from here.
    if (index - last_slash > kMaximumSubdirectoryLength) {
#ifdef WIN32
      const char slash = '\\';
#else
      const char slash = '/';
#endif
      output.append(1, slash);
      last_slash = index;
    }
  }
  return output;
}
} // namespace net