// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "net/http/http_content_disposition.h" #include "base/base64.h" #include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/strings/string_tokenizer.h" #include "base/strings/string_util.h" #include "base/strings/sys_string_conversions.h" #include "base/strings/utf_string_conversions.h" #include "net/base/net_util.h" #include "net/http/http_util.h" #include "third_party/icu/source/common/unicode/ucnv.h" namespace { enum RFC2047EncodingType { Q_ENCODING, B_ENCODING }; // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to // decoding a quoted-printable string. Returns true if the input was valid. bool DecodeQEncoding(const std::string& input, std::string* output) { std::string temp; temp.reserve(input.size()); for (std::string::const_iterator it = input.begin(); it != input.end(); ++it) { if (*it == '_') { temp.push_back(' '); } else if (*it == '=') { if ((input.end() - it < 3) || !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) return false; unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + HexDigitToInt(*(it + 2)); temp.push_back(static_cast<char>(ch)); ++it; ++it; } else if (0x20 < *it && *it < 0x7F && *it != '?') { // In a Q-encoded word, only printable ASCII characters // represent themselves. Besides, space, '=', '_' and '?' are // not allowed, but they're already filtered out. DCHECK_NE('=', *it); DCHECK_NE('?', *it); DCHECK_NE('_', *it); temp.push_back(*it); } else { return false; } } output->swap(temp); return true; } // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding // type is specified in |enc_type|. bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type, const std::string& charset, std::string* output) { std::string decoded; if (!((enc_type == B_ENCODING) ? base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) return false; if (decoded.empty()) { output->clear(); return true; } UErrorCode err = U_ZERO_ERROR; UConverter* converter(ucnv_open(charset.c_str(), &err)); if (U_FAILURE(err)) return false; // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a // trailing '\0'. size_t output_length = decoded.length() * 3 + 1; char* buf = WriteInto(output, output_length); output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, decoded.data(), decoded.length(), &err); ucnv_close(converter); if (U_FAILURE(err)) return false; output->resize(output_length); return true; } bool DecodeWord(const std::string& encoded_word, const std::string& referrer_charset, bool* is_rfc2047, std::string* output, int* parse_result_flags) { *is_rfc2047 = false; output->clear(); if (encoded_word.empty()) return true; if (!IsStringASCII(encoded_word)) { // Try UTF-8, referrer_charset and the native OS default charset in turn. if (IsStringUTF8(encoded_word)) { *output = encoded_word; } else { base::string16 utf16_output; if (!referrer_charset.empty() && base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), base::OnStringConversionError::FAIL, &utf16_output)) { *output = UTF16ToUTF8(utf16_output); } else { *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); } } *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS; return true; } // RFC 2047 : one of encoding methods supported by Firefox and relatively // widely used by web servers. // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. // We don't care about the length restriction (72 bytes) because // many web servers generate encoded words longer than the limit. std::string decoded_word; *is_rfc2047 = true; int part_index = 0; std::string charset; base::StringTokenizer t(encoded_word, "?"); RFC2047EncodingType enc_type = Q_ENCODING; while (*is_rfc2047 && t.GetNext()) { std::string part = t.token(); switch (part_index) { case 0: if (part != "=") { *is_rfc2047 = false; break; } ++part_index; break; case 1: // Do we need charset validity check here? charset = part; ++part_index; break; case 2: if (part.size() > 1 || part.find_first_of("bBqQ") == std::string::npos) { *is_rfc2047 = false; break; } if (part[0] == 'b' || part[0] == 'B') { enc_type = B_ENCODING; } ++part_index; break; case 3: *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word); if (!*is_rfc2047) { // Last minute failure. Invalid B/Q encoding. Rather than // passing it through, return now. return false; } ++part_index; break; case 4: if (part != "=") { // Another last minute failure ! // Likely to be a case of two encoded-words in a row or // an encoded word followed by a non-encoded word. We can be // generous, but it does not help much in terms of compatibility, // I believe. Return immediately. *is_rfc2047 = false; return false; } ++part_index; break; default: *is_rfc2047 = false; return false; } } if (*is_rfc2047) { if (*(encoded_word.end() - 1) == '=') { output->swap(decoded_word); *parse_result_flags |= net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS; return true; } // encoded_word ending prematurelly with '?' or extra '?' *is_rfc2047 = false; return false; } // We're not handling 'especial' characters quoted with '\', but // it should be Ok because we're not an email client but a // web browser. // What IE6/7 does: %-escaped UTF-8. decoded_word = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); if (decoded_word != encoded_word) *parse_result_flags |= net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS; if (IsStringUTF8(decoded_word)) { output->swap(decoded_word); return true; // We can try either the OS default charset or 'origin charset' here, // As far as I can tell, IE does not support it. However, I've seen // web servers emit %-escaped string in a legacy encoding (usually // origin charset). // TODO(jungshik) : Test IE further and consider adding a fallback here. } return false; } // Decodes the value of a 'filename' or 'name' parameter given as |input|. The // value is supposed to be of the form: // // value = token | quoted-string // // However we currently also allow RFC 2047 encoding and non-ASCII // strings. Non-ASCII strings are interpreted based on |referrer_charset|. bool DecodeFilenameValue(const std::string& input, const std::string& referrer_charset, std::string* output, int* parse_result_flags) { int current_parse_result_flags = 0; std::string decoded_value; bool is_previous_token_rfc2047 = true; // Tokenize with whitespace characters. base::StringTokenizer t(input, " \t\n\r"); t.set_options(base::StringTokenizer::RETURN_DELIMS); while (t.GetNext()) { if (t.token_is_delim()) { // If the previous non-delimeter token is not RFC2047-encoded, // put in a space in its place. Otheriwse, skip over it. if (!is_previous_token_rfc2047) decoded_value.push_back(' '); continue; } // We don't support a single multibyte character split into // adjacent encoded words. Some broken mail clients emit headers // with that problem, but most web servers usually encode a filename // in a single encoded-word. Firefox/Thunderbird do not support // it, either. std::string decoded; if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, &decoded, ¤t_parse_result_flags)) return false; decoded_value.append(decoded); } output->swap(decoded_value); if (parse_result_flags && !output->empty()) *parse_result_flags |= current_parse_result_flags; return true; } // Parses the charset and value-chars out of an ext-value string. // // ext-value = charset "'" [ language ] "'" value-chars bool ParseExtValueComponents(const std::string& input, std::string* charset, std::string* value_chars) { base::StringTokenizer t(input, "'"); t.set_options(base::StringTokenizer::RETURN_DELIMS); std::string temp_charset; std::string temp_value; int numDelimsSeen = 0; while (t.GetNext()) { if (t.token_is_delim()) { ++numDelimsSeen; continue; } else { switch (numDelimsSeen) { case 0: temp_charset = t.token(); break; case 1: // Language is ignored. break; case 2: temp_value = t.token(); break; default: return false; } } } if (numDelimsSeen != 2) return false; if (temp_charset.empty() || temp_value.empty()) return false; charset->swap(temp_charset); value_chars->swap(temp_value); return true; } // http://tools.ietf.org/html/rfc5987#section-3.2 // // ext-value = charset "'" [ language ] "'" value-chars // // charset = "UTF-8" / "ISO-8859-1" / mime-charset // // mime-charset = 1*mime-charsetc // mime-charsetc = ALPHA / DIGIT // / "!" / "#" / "$" / "%" / "&" // / "+" / "-" / "^" / "_" / "`" // / "{" / "}" / "~" // // language = <Language-Tag, defined in [RFC5646], Section 2.1> // // value-chars = *( pct-encoded / attr-char ) // // pct-encoded = "%" HEXDIG HEXDIG // // attr-char = ALPHA / DIGIT // / "!" / "#" / "$" / "&" / "+" / "-" / "." // / "^" / "_" / "`" / "|" / "~" bool DecodeExtValue(const std::string& param_value, std::string* decoded) { if (param_value.find('"') != std::string::npos) return false; std::string charset; std::string value; if (!ParseExtValueComponents(param_value, &charset, &value)) return false; // RFC 5987 value should be ASCII-only. if (!IsStringASCII(value)) { decoded->clear(); return true; } std::string unescaped = net::UnescapeURLComponent( value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); } } // namespace namespace net { HttpContentDisposition::HttpContentDisposition( const std::string& header, const std::string& referrer_charset) : type_(INLINE), parse_result_flags_(INVALID) { Parse(header, referrer_charset); } HttpContentDisposition::~HttpContentDisposition() { } std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( std::string::const_iterator begin, std::string::const_iterator end) { DCHECK(type_ == INLINE); std::string::const_iterator delimiter = std::find(begin, end, ';'); std::string::const_iterator type_begin = begin; std::string::const_iterator type_end = delimiter; HttpUtil::TrimLWS(&type_begin, &type_end); // If the disposition-type isn't a valid token the then the // Content-Disposition header is malformed, and we treat the first bytes as // a parameter rather than a disposition-type. if (!HttpUtil::IsToken(type_begin, type_end)) return begin; parse_result_flags_ |= HAS_DISPOSITION_TYPE; DCHECK(std::find(type_begin, type_end, '=') == type_end); if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) { type_ = INLINE; } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) { type_ = ATTACHMENT; } else { parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE; type_ = ATTACHMENT; } return delimiter; } // http://tools.ietf.org/html/rfc6266 // // content-disposition = "Content-Disposition" ":" // disposition-type *( ";" disposition-parm ) // // disposition-type = "inline" | "attachment" | disp-ext-type // ; case-insensitive // disp-ext-type = token // // disposition-parm = filename-parm | disp-ext-parm // // filename-parm = "filename" "=" value // | "filename*" "=" ext-value // // disp-ext-parm = token "=" value // | ext-token "=" ext-value // ext-token = <the characters in token, followed by "*"> // void HttpContentDisposition::Parse(const std::string& header, const std::string& referrer_charset) { DCHECK(type_ == INLINE); DCHECK(filename_.empty()); std::string::const_iterator pos = header.begin(); std::string::const_iterator end = header.end(); pos = ConsumeDispositionType(pos, end); std::string name; std::string filename; std::string ext_filename; HttpUtil::NameValuePairsIterator iter(pos, end, ';'); while (iter.GetNext()) { if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), iter.name_end(), "filename")) { DecodeFilenameValue(iter.value(), referrer_charset, &filename, &parse_result_flags_); if (!filename.empty()) parse_result_flags_ |= HAS_FILENAME; } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), iter.name_end(), "name")) { DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL); if (!name.empty()) parse_result_flags_ |= HAS_NAME; } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), iter.name_end(), "filename*")) { DecodeExtValue(iter.raw_value(), &ext_filename); if (!ext_filename.empty()) parse_result_flags_ |= HAS_EXT_FILENAME; } } if (!ext_filename.empty()) filename_ = ext_filename; else if (!filename.empty()) filename_ = filename; else filename_ = name; } } // namespace net