// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "net/tools/flip_server/balsa_frame.h" #include <assert.h> #include <emmintrin.h> #include <strings.h> #include <limits> #include <iostream> #include <string> #include <utility> #include <vector> #include "base/logging.h" #include "base/port.h" #include "base/string_piece.h" #include "net/tools/flip_server/balsa_enums.h" #include "net/tools/flip_server/balsa_headers.h" #include "net/tools/flip_server/balsa_visitor_interface.h" #include "net/tools/flip_server/buffer_interface.h" #include "net/tools/flip_server/simple_buffer.h" #include "net/tools/flip_server/split.h" #include "net/tools/flip_server/string_piece_utils.h" namespace net { // Constants holding some header names for headers which can affect the way the // HTTP message is framed, and so must be processed specially: static const char kContentLength[] = "content-length"; static const size_t kContentLengthSize = sizeof(kContentLength) - 1; static const char kTransferEncoding[] = "transfer-encoding"; static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1; void BalsaFrame::Reset() { last_char_was_slash_r_ = false; saw_non_newline_char_ = false; start_was_space_ = true; chunk_length_character_extracted_ = false; // is_request_ = true; // not reset between messages. // request_was_head_ = false; // not reset between messages. // max_header_length_ = 4096; // not reset between messages. // max_request_uri_length_ = 2048; // not reset between messages. // visitor_ = &do_nothing_visitor_; // not reset between messages. chunk_length_remaining_ = 0; content_length_remaining_ = 0; last_slash_n_loc_ = NULL; last_recorded_slash_n_loc_ = NULL; last_slash_n_idx_ = 0; term_chars_ = 0; parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE; last_error_ = BalsaFrameEnums::NO_ERROR; lines_.clear(); if (headers_ != NULL) { headers_->Clear(); } } const char* BalsaFrameEnums::ParseStateToString( BalsaFrameEnums::ParseState error_code) { switch (error_code) { case ERROR: return "ERROR"; case READING_HEADER_AND_FIRSTLINE: return "READING_HEADER_AND_FIRSTLINE"; case READING_CHUNK_LENGTH: return "READING_CHUNK_LENGTH"; case READING_CHUNK_EXTENSION: return "READING_CHUNK_EXTENSION"; case READING_CHUNK_DATA: return "READING_CHUNK_DATA"; case READING_CHUNK_TERM: return "READING_CHUNK_TERM"; case READING_LAST_CHUNK_TERM: return "READING_LAST_CHUNK_TERM"; case READING_TRAILER: return "READING_TRAILER"; case READING_UNTIL_CLOSE: return "READING_UNTIL_CLOSE"; case READING_CONTENT: return "READING_CONTENT"; case MESSAGE_FULLY_READ: return "MESSAGE_FULLY_READ"; case NUM_STATES: return "UNKNOWN_STATE"; } return "UNKNOWN_STATE"; } const char* BalsaFrameEnums::ErrorCodeToString( BalsaFrameEnums::ErrorCode error_code) { switch (error_code) { case NO_ERROR: return "NO_ERROR"; case NO_STATUS_LINE_IN_RESPONSE: return "NO_STATUS_LINE_IN_RESPONSE"; case NO_REQUEST_LINE_IN_REQUEST: return "NO_REQUEST_LINE_IN_REQUEST"; case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION: return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION"; case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD: return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD"; case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE: return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE"; case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI: return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI"; case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE: return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE"; case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION: return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION"; case FAILED_CONVERTING_STATUS_CODE_TO_INT: return "FAILED_CONVERTING_STATUS_CODE_TO_INT"; case REQUEST_URI_TOO_LONG: return "REQUEST_URI_TOO_LONG"; case HEADERS_TOO_LONG: return "HEADERS_TOO_LONG"; case UNPARSABLE_CONTENT_LENGTH: return "UNPARSABLE_CONTENT_LENGTH"; case MAYBE_BODY_BUT_NO_CONTENT_LENGTH: return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH"; case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH: return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH"; case HEADER_MISSING_COLON: return "HEADER_MISSING_COLON"; case INVALID_CHUNK_LENGTH: return "INVALID_CHUNK_LENGTH"; case CHUNK_LENGTH_OVERFLOW: return "CHUNK_LENGTH_OVERFLOW"; case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO: return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO"; case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT: return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT"; case MULTIPLE_CONTENT_LENGTH_KEYS: return "MULTIPLE_CONTENT_LENGTH_KEYS"; case MULTIPLE_TRANSFER_ENCODING_KEYS: return "MULTIPLE_TRANSFER_ENCODING_KEYS"; case UNKNOWN_TRANSFER_ENCODING: return "UNKNOWN_TRANSFER_ENCODING"; case INVALID_HEADER_FORMAT: return "INVALID_HEADER_FORMAT"; case INTERNAL_LOGIC_ERROR: return "INTERNAL_LOGIC_ERROR"; case NUM_ERROR_CODES: return "UNKNOWN_ERROR"; } return "UNKNOWN_ERROR"; } // Summary: // Parses the first line of either a request or response. // Note that in the case of a detected warning, error_code will be set // but the function will not return false. // Exactly zero or one warning or error (but not both) may be detected // by this function. // Note that this function will not write the data of the first-line // into the header's buffer (that should already have been done elsewhere). // // Pre-conditions: // begin != end // *begin should be a character which is > ' '. This implies that there // is at least one non-whitespace characters between [begin, end). // headers is a valid pointer to a BalsaHeaders class. // error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value. // Entire first line must exist between [begin, end) // Exactly zero or one newlines -may- exist between [begin, end) // [begin, end) should exist in the header's buffer. // // Side-effects: // headers will be modified // error_code may be modified if either a warning or error is detected // // Returns: // True if no error (as opposed to warning) is detected. // False if an error (as opposed to warning) is detected. // // If there is indeed non-whitespace in the line, then the following // will take care of this for you: // while (*begin <= ' ') ++begin; // ProcessFirstLine(begin, end, is_request, &headers, &error_code); // bool ParseHTTPFirstLine(const char* begin, const char* end, bool is_request, size_t max_request_uri_length, BalsaHeaders* headers, BalsaFrameEnums::ErrorCode* error_code) { const char* current = begin; // HTTP firstlines all have the following structure: // LWS NONWS LWS NONWS LWS NONWS NOTCRLF CRLF // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n" // ws1 nws1 ws2 nws2 ws3 nws3 ws4 // | [-------) [-------) [----------------) // REQ: method request_uri version // RESP: version statuscode reason // // The first NONWS->LWS component we'll call firstline_a. // The second firstline_b, and the third firstline_c. // // firstline_a goes from nws1 to (but not including) ws2 // firstline_b goes from nws2 to (but not including) ws3 // firstline_c goes from nws3 to (but not including) ws4 // // In the code: // ws1 == whitespace_1_idx_ // nws1 == non_whitespace_1_idx_ // ws2 == whitespace_2_idx_ // nws2 == non_whitespace_2_idx_ // ws3 == whitespace_3_idx_ // nws3 == non_whitespace_3_idx_ // ws4 == whitespace_4_idx_ // Kill all whitespace (including '\r\n') at the end of the line. --end; if (*end != '\n') { *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" << headers->OriginalHeadersForDebugging(); return false; } while (begin < end && *end <= ' ') { --end; } DCHECK(*end != '\n'); if (*end == '\n') { *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" << headers->OriginalHeadersForDebugging(); return false; } ++end; // The two following statements should not be possible. if (end == begin) { *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" << headers->OriginalHeadersForDebugging(); return false; } // whitespace_1_idx_ headers->whitespace_1_idx_ = current - begin; // This loop is commented out as it is never used in current code. This is // true only because we don't begin parsing the headers at all until we've // encountered a non whitespace character at the beginning of the stream, at // which point we begin our demarcation of header-start. If we did -not- do // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop // would be necessary for the proper functioning of this parsing. // This is left here as this function may (in the future) be refactored out // of the BalsaFrame class so that it may be shared between code in // BalsaFrame and BalsaHeaders (where it would be used in some variant of the // set_first_line() function (at which point it would be necessary). #if 0 while (*current <= ' ') { ++current; } #endif // non_whitespace_1_idx_ headers->non_whitespace_1_idx_ = current - begin; do { // The first time through, we're guaranteed that the current character // won't be a whitespace (else the loop above wouldn't have terminated). // That implies that we're guaranteed to get at least one non-whitespace // character if we get into this loop at all. ++current; if (current == end) { headers->whitespace_2_idx_ = current - begin; headers->non_whitespace_2_idx_ = current - begin; headers->whitespace_3_idx_ = current - begin; headers->non_whitespace_3_idx_ = current - begin; headers->whitespace_4_idx_ = current - begin; // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD for request // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response *error_code = static_cast<BalsaFrameEnums::ErrorCode>( BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION + is_request); if (!is_request) { // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION return false; } goto output_exhausted; } } while (*current > ' '); // whitespace_2_idx_ headers->whitespace_2_idx_ = current - begin; do { ++current; // Note that due to the loop which consumes all of the whitespace // at the end of the line, current can never == end while in this function. } while (*current <= ' '); // non_whitespace_2_idx_ headers->non_whitespace_2_idx_ = current - begin; do { ++current; if (current == end) { headers->whitespace_3_idx_ = current - begin; headers->non_whitespace_3_idx_ = current - begin; headers->whitespace_4_idx_ = current - begin; // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response *error_code = static_cast<BalsaFrameEnums::ErrorCode>( BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE + is_request); goto output_exhausted; } } while (*current > ' '); // whitespace_3_idx_ headers->whitespace_3_idx_ = current - begin; do { ++current; // Note that due to the loop which consumes all of the whitespace // at the end of the line, current can never == end while in this function. } while (*current <= ' '); // non_whitespace_3_idx_ headers->non_whitespace_3_idx_ = current - begin; headers->whitespace_4_idx_ = end - begin; output_exhausted: // Note that we don't fail the parse immediately when parsing of the // firstline fails. Depending on the protocol type, we may want to accept // a firstline with only one or two elements, e.g., for HTTP/0.9: // GET\r\n // or // GET /\r\n // should be parsed without issue (though the visitor should know that // parsing the entire line was not exactly as it should be). // // Eventually, these errors may be removed alltogether, as the visitor can // detect them on its own by examining the size of the various fields. // headers->set_first_line(non_whitespace_1_idx_, current); if (is_request) { if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) > max_request_uri_length) { // For requests, we need at least the method. We could assume that a // blank URI means "/". If version isn't stated, it should be assumed // to be HTTP/0.9 by the visitor. *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG; return false; } } else { headers->parsed_response_code_ = 0; { const char* parsed_response_code_current = begin + headers->non_whitespace_2_idx_; const char* parsed_response_code_end = begin + headers->whitespace_3_idx_; const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; // Convert a string of [0-9]* into an int. // Note that this allows for the conversion of response codes which // are outside the bounds of normal HTTP response codes (no checking // is done to ensure that these are valid-- they're merely parsed)! while (parsed_response_code_current < parsed_response_code_end) { if (*parsed_response_code_current < '0' || *parsed_response_code_current > '9') { *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; return false; } size_t status_code_x_10 = headers->parsed_response_code_ * 10; uint8 c = *parsed_response_code_current - '0'; if ((headers->parsed_response_code_ > kMaxDiv10) || (std::numeric_limits<size_t>::max() - status_code_x_10) < c) { // overflow. *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; return false; } headers->parsed_response_code_ = status_code_x_10 + c; ++parsed_response_code_current; } } } return true; } // begin - beginning of the firstline // end - end of the firstline // // A precondition for this function is that there is non-whitespace between // [begin, end). If this precondition is not met, the function will not perform // as expected (and bad things may happen, and it will eat your first, second, // and third unborn children!). // // Another precondition for this function is that [begin, end) includes // at most one newline, which must be at the end of the line. void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) { BalsaFrameEnums::ErrorCode previous_error = last_error_; if (!ParseHTTPFirstLine(begin, end, is_request_, max_request_uri_length_, headers_, &last_error_)) { parse_state_ = BalsaFrameEnums::PARSE_ERROR; visitor_->HandleHeaderError(this); return; } if (previous_error != last_error_) { visitor_->HandleHeaderWarning(this); } if (is_request_) { int version_length = headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_; visitor_->ProcessRequestFirstLine( begin + headers_->non_whitespace_1_idx_, headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, begin + headers_->non_whitespace_1_idx_, headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, begin + headers_->non_whitespace_2_idx_, headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, begin + headers_->non_whitespace_3_idx_, version_length); if (version_length == 0) parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; } else { visitor_->ProcessResponseFirstLine( begin + headers_->non_whitespace_1_idx_, headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, begin + headers_->non_whitespace_1_idx_, headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, begin + headers_->non_whitespace_2_idx_, headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, begin + headers_->non_whitespace_3_idx_, headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_); } } // 'stream_begin' points to the first character of the headers buffer. // 'line_begin' points to the first character of the line. // 'current' points to a char which is ':'. // 'line_end' points to the position of '\n' + 1. // 'line_begin' points to the position of first character of line. void BalsaFrame::CleanUpKeyValueWhitespace( const char* stream_begin, const char* line_begin, const char* current, const char* line_end, HeaderLineDescription* current_header_line) { const char* colon_loc = current; DCHECK_LT(colon_loc, line_end); DCHECK_EQ(':', *colon_loc); DCHECK_EQ(':', *current); DCHECK_GE(' ', *line_end) << "\"" << std::string(line_begin, line_end) << "\""; // TODO(fenix): Investigate whether or not the bounds tests in the // while loops here are redundant, and if so, remove them. --current; while (current > line_begin && *current <= ' ') --current; current += (current != colon_loc); current_header_line->key_end_idx = current - stream_begin; current = colon_loc; DCHECK_EQ(':', *current); ++current; while (current < line_end && *current <= ' ') ++current; current_header_line->value_begin_idx = current - stream_begin; DCHECK_GE(current_header_line->key_end_idx, current_header_line->first_char_idx); DCHECK_GE(current_header_line->value_begin_idx, current_header_line->key_end_idx); DCHECK_GE(current_header_line->last_char_idx, current_header_line->value_begin_idx); } inline void BalsaFrame::FindColonsAndParseIntoKeyValue() { DCHECK(!lines_.empty()); const char* stream_begin = headers_->OriginalHeaderStreamBegin(); // The last line is always just a newline (and is uninteresting). const Lines::size_type lines_size_m1 = lines_.size() - 1; #if __SSE2__ const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':', ':', ':', ':', ':', ':', ':', ':', ':'}; const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16; #endif // __SSE2__ const char* current = stream_begin + lines_[1].first; // This code is a bit more subtle than it may appear at first glance. // This code looks for a colon in the current line... but it also looks // beyond the current line. If there is no colon in the current line, then // for each subsequent line (until the colon which -has- been found is // associated with a line), no searching for a colon will be performed. In // this way, we minimize the amount of bytes we have scanned for a colon. for (Lines::size_type i = 1; i < lines_size_m1;) { const char* line_begin = stream_begin + lines_[i].first; // Here we handle possible continuations. Note that we do not replace // the '\n' in the line before a continuation (at least, as of now), // which implies that any code which looks for a value must deal with // "\r\n", etc -within- the line (and not just at the end of it). for (++i; i < lines_size_m1; ++i) { const char c = *(stream_begin + lines_[i].first); if (c > ' ') { // Not a continuation, so stop. Note that if the 'original' i = 1, // and the next line is not a continuation, we'll end up with i = 2 // when we break. This handles the incrementing of i for the outer // loop. break; } } const char* line_end = stream_begin + lines_[i - 1].second; DCHECK_LT(line_begin - stream_begin, line_end - stream_begin); // We cleanup the whitespace at the end of the line before doing anything // else of interest as it allows us to do nothing when irregularly formatted // headers are parsed (e.g. those with only keys, only values, or no colon). // // We're guaranteed to have *line_end > ' ' while line_end >= line_begin. --line_end; DCHECK_EQ('\n', *line_end) << "\"" << std::string(line_begin, line_end) << "\""; while (*line_end <= ' ' && line_end > line_begin) { --line_end; } ++line_end; DCHECK_GE(' ', *line_end); DCHECK_LT(line_begin, line_end); // We use '0' for the block idx, because we're always writing to the first // block from the framer (we do this because the framer requires that the // entire header sequence be in a contiguous buffer). headers_->header_lines_.push_back( HeaderLineDescription(line_begin - stream_begin, line_end - stream_begin, line_end - stream_begin, line_end - stream_begin, 0)); if (current >= line_end) { last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; visitor_->HandleHeaderWarning(this); // Then the next colon will not be found within this header line-- time // to try again with another header-line. continue; } else if (current < line_begin) { // When this condition is true, the last detected colon was part of a // previous line. We reset to the beginning of the line as we don't care // about the presence of any colon before the beginning of the current // line. current = line_begin; } #if __SSE2__ while (current < header_lines_end_m16) { __m128i header_bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(current)); __m128i colon_cmp = _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons)); int colon_msk = _mm_movemask_epi8(colon_cmp); if (colon_msk == 0) { current += 16; continue; } current += (ffs(colon_msk) - 1); if (current > line_end) { break; } goto found_colon; } #endif // __SSE2__ for (; current < line_end; ++current) { if (*current != ':') { continue; } goto found_colon; } // If we've gotten to here, then there was no colon // in the line. The arguments we passed into the construction // for the HeaderLineDescription object should be OK-- it assumes // that the entire content is 'key' by default (which is true, as // there was no colon, there can be no value). Note that this is a // construct which is technically not allowed by the spec. last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; visitor_->HandleHeaderWarning(this); continue; found_colon: DCHECK_EQ(*current, ':'); DCHECK_LE(current - stream_begin, line_end - stream_begin); DCHECK_LE(stream_begin - stream_begin, current - stream_begin); HeaderLineDescription& current_header_line = headers_->header_lines_.back(); current_header_line.key_end_idx = current - stream_begin; current_header_line.value_begin_idx = current_header_line.key_end_idx; if (current < line_end) { ++current_header_line.key_end_idx; CleanUpKeyValueWhitespace(stream_begin, line_begin, current, line_end, ¤t_header_line); } } } void BalsaFrame::ProcessContentLengthLine( HeaderLines::size_type line_idx, BalsaHeadersEnums::ContentLengthStatus* status, size_t* length) { const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; const char* stream_begin = headers_->OriginalHeaderStreamBegin(); const char* line_end = stream_begin + header_line.last_char_idx; const char* value_begin = (stream_begin + header_line.value_begin_idx); if (value_begin >= line_end) { // There is no non-whitespace value data. #if DEBUGFRAMER LOG(INFO) << "invalid content-length -- no non-whitespace value data"; #endif *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; return; } *length = 0; while (value_begin < line_end) { if (*value_begin < '0' || *value_begin > '9') { // bad! content-length found, and couldn't parse all of it! *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; #if DEBUGFRAMER LOG(INFO) << "invalid content-length - non numeric character detected"; #endif // DEBUGFRAMER return; } const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10; size_t length_x_10 = *length * 10; const unsigned char c = *value_begin - '0'; if (*length > kMaxDiv10 || (std::numeric_limits<size_t>::max() - length_x_10) < c) { *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW; #if DEBUGFRAMER LOG(INFO) << "content-length overflow"; #endif // DEBUGFRAMER return; } *length = length_x_10 + c; ++value_begin; } #if DEBUGFRAMER LOG(INFO) << "content_length parsed: " << *length; #endif // DEBUGFRAMER *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH; } void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) { const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; const char* stream_begin = headers_->OriginalHeaderStreamBegin(); const char* line_end = stream_begin + header_line.last_char_idx; const char* value_begin = stream_begin + header_line.value_begin_idx; size_t value_length = line_end - value_begin; if ((value_length == 7) && !strncasecmp(value_begin, "chunked", 7)) { headers_->transfer_encoding_is_chunked_ = true; } else if ((value_length == 8) && !strncasecmp(value_begin, "identity", 8)) { headers_->transfer_encoding_is_chunked_ = false; } else { last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING; parse_state_ = BalsaFrameEnums::PARSE_ERROR; visitor_->HandleHeaderError(this); return; } } namespace { bool SplitStringPiece(base::StringPiece original, char delim, base::StringPiece* before, base::StringPiece* after) { const char* p = original.data(); const char* end = p + original.size(); while (p != end) { if (*p == delim) { ++p; } else { const char* start = p; while (++p != end && *p != delim) { // Skip to the next occurence of the delimiter. } *before = base::StringPiece(start, p - start); if (p != end) *after = base::StringPiece(p + 1, end - (p + 1)); else *after = base::StringPiece(""); StringPieceUtils::RemoveWhitespaceContext(before); StringPieceUtils::RemoveWhitespaceContext(after); return true; } } *before = original; *after = ""; return false; } // TODO(phython): Fix this function to properly deal with quoted values. // E.g. ";;foo", "\";;\"", or \"aa; // The last example, the semi-colon is a separator between extensions. void ProcessChunkExtensionsManual(base::StringPiece all_extensions, BalsaHeaders* extensions) { base::StringPiece extension; base::StringPiece remaining; StringPieceUtils::RemoveWhitespaceContext(&all_extensions); SplitStringPiece(all_extensions, ';', &extension, &remaining); while (!extension.empty()) { base::StringPiece key; base::StringPiece value; SplitStringPiece(extension, '=', &key, &value); if (!value.empty()) { // Strip quotation marks if they exist. if (!value.empty() && value[0] == '"') value.remove_prefix(1); if (!value.empty() && value[value.length() - 1] == '"') value.remove_suffix(1); } extensions->AppendHeader(key, value); StringPieceUtils::RemoveWhitespaceContext(&remaining); SplitStringPiece(remaining, ';', &extension, &remaining); } } // TODO(phython): Fix this function to properly deal with quoted values. // E.g. ";;foo", "\";;\"", or \"aa; // The last example, the semi-colon is a separator between extensions. void ProcessChunkExtensionsGoogle3(const char* input, size_t size, BalsaHeaders* extensions) { std::vector<base::StringPiece> key_values; SplitStringPieceToVector(base::StringPiece(input, size), ";", &key_values, true); for (unsigned int i = 0; i < key_values.size(); ++i) { base::StringPiece key = key_values[i].substr(0, key_values[i].find('=')); base::StringPiece value; if (key.length() < key_values[i].length()) { value = key_values[i].substr(key.length() + 1); // Remove any leading and trailing whitespace. StringPieceUtils::RemoveWhitespaceContext(&value); // Strip quotation marks if they exist. if (!value.empty() && value[0] == '"') value.remove_prefix(1); if (!value.empty() && value[value.length() - 1] == '"') value.remove_suffix(1); } // Strip the key whitespace after checking that there is a value. StringPieceUtils::RemoveWhitespaceContext(&key); extensions->AppendHeader(key, value); } } } // anonymous namespace void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size, BalsaHeaders* extensions) { #if 0 ProcessChunkExtensionsGoogle3(input, size, extensions); #else ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions); #endif } void BalsaFrame::ProcessHeaderLines() { HeaderLines::size_type content_length_idx = 0; HeaderLines::size_type transfer_encoding_idx = 0; DCHECK(!lines_.empty()); #if DEBUGFRAMER LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n"; #endif // DEBUGFRAMER // There is no need to attempt to process headers if no header lines exist. // There are at least two lines in the message which are not header lines. // These two non-header lines are the first line of the message, and the // last line of the message (which is an empty line). // Thus, we test to see if we have more than two lines total before attempting // to parse any header lines. if (lines_.size() > 2) { const char* stream_begin = headers_->OriginalHeaderStreamBegin(); // Then, for the rest of the header data, we parse these into key-value // pairs. FindColonsAndParseIntoKeyValue(); // At this point, we've parsed all of the headers. Time to look for those // headers which we require for framing. const HeaderLines::size_type header_lines_size = headers_->header_lines_.size(); for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) { const HeaderLineDescription& current_header_line = headers_->header_lines_[i]; const char* key_begin = (stream_begin + current_header_line.first_char_idx); const char* key_end = (stream_begin + current_header_line.key_end_idx); const size_t key_len = key_end - key_begin; const char c = *key_begin; #if DEBUGFRAMER LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len) << " c: '" << c << "' key_len: " << key_len; #endif // DEBUGFRAMER // If a header begins with either lowercase or uppercase 'c' or 't', then // the header may be one of content-length, connection, content-encoding // or transfer-encoding. These headers are special, as they change the way // that the message is framed, and so the framer is required to search // for them. if (c == 'c' || c == 'C') { if ((key_len == kContentLengthSize) && 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) { BalsaHeadersEnums::ContentLengthStatus content_length_status = BalsaHeadersEnums::NO_CONTENT_LENGTH; size_t length = 0; ProcessContentLengthLine(i, &content_length_status, &length); if (content_length_idx != 0) { // then we've already seen one! if ((headers_->content_length_status_ != content_length_status) || ((headers_->content_length_status_ == BalsaHeadersEnums::VALID_CONTENT_LENGTH) && length != headers_->content_length_)) { last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS; parse_state_ = BalsaFrameEnums::PARSE_ERROR; visitor_->HandleHeaderError(this); return; } continue; } else { content_length_idx = i + 1; headers_->content_length_status_ = content_length_status; headers_->content_length_ = length; content_length_remaining_ = length; } } } else if (c == 't' || c == 'T') { if ((key_len == kTransferEncodingSize) && 0 == strncasecmp(key_begin, kTransferEncoding, kTransferEncodingSize)) { if (transfer_encoding_idx != 0) { last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS; parse_state_ = BalsaFrameEnums::PARSE_ERROR; visitor_->HandleHeaderError(this); return; } transfer_encoding_idx = i + 1; } } else if (i == 0 && (key_len == 0 || c == ' ')) { last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT; parse_state_ = BalsaFrameEnums::PARSE_ERROR; visitor_->HandleHeaderError(this); return; } } if (headers_->transfer_encoding_is_chunked_) { headers_->content_length_ = 0; headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH; content_length_remaining_ = 0; } if (transfer_encoding_idx != 0) { ProcessTransferEncodingLine(transfer_encoding_idx - 1); } } } void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() { // For responses, can't have a body if the request was a HEAD, or if it is // one of these response-codes. rfc2616 section 4.3 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; if (is_request_ || !(request_was_head_ || (headers_->parsed_response_code_ >= 100 && headers_->parsed_response_code_ < 200) || (headers_->parsed_response_code_ == 204) || (headers_->parsed_response_code_ == 304))) { // Then we can have a body. if (headers_->transfer_encoding_is_chunked_) { // Note that // if ( Transfer-Encoding: chunked && Content-length: ) // then Transfer-Encoding: chunked trumps. // This is as specified in the spec. // rfc2616 section 4.4.3 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; } else { // Errors parsing content-length definitely can cause // protocol errors/warnings switch (headers_->content_length_status_) { // If we have a content-length, and it is parsed // properly, there are two options. // 1) zero content, in which case the message is done, and // 2) nonzero content, in which case we have to // consume the body. case BalsaHeadersEnums::VALID_CONTENT_LENGTH: if (headers_->content_length_ == 0) { parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; } else { parse_state_ = BalsaFrameEnums::READING_CONTENT; } break; case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW: case BalsaHeadersEnums::INVALID_CONTENT_LENGTH: // If there were characters left-over after parsing the // content length, we should flag an error and stop. parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH; visitor_->HandleHeaderError(this); break; // We can have: no transfer-encoding, no content length, and no // connection: close... // Unfortunately, this case doesn't seem to be covered in the spec. // We'll assume that the safest thing to do here is what the google // binaries before 2008 already do, which is to assume that // everything until the connection is closed is body. case BalsaHeadersEnums::NO_CONTENT_LENGTH: if (is_request_) { base::StringPiece method = headers_->request_method(); // POSTs and PUTs should have a detectable body length. If they // do not we consider it an error. if ((method.size() == 4 && strncmp(method.data(), "POST", 4) == 0) || (method.size() == 3 && strncmp(method.data(), "PUT", 3) == 0)) { parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH; visitor_->HandleHeaderError(this); break; } parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; } else { parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE; last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH; visitor_->HandleHeaderWarning(this); } break; // The COV_NF_... statements here provide hints to the apparatus // which computes coverage reports/ratios that this code is never // intended to be executed, and should technically be impossible. // COV_NF_START default: LOG(FATAL) << "Saw a content_length_status: " << headers_->content_length_status_ << " which is unknown."; // COV_NF_END } } } } size_t BalsaFrame::ProcessHeaders(const char* message_start, size_t message_length) { const char* const original_message_start = message_start; const char* const message_end = message_start + message_length; const char* message_current = message_start; const char* checkpoint = message_start; if (message_length == 0) { goto bottom; } while (message_current < message_end) { size_t base_idx = headers_->GetReadableBytesFromHeaderStream(); // Yes, we could use strchr (assuming null termination), or // memchr, but as it turns out that is slower than this tight loop // for the input that we see. if (!saw_non_newline_char_) { do { const char c = *message_current; if (c != '\r' && c != '\n') { if (c <= ' ') { parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST; visitor_->HandleHeaderError(this); goto bottom; } else { saw_non_newline_char_ = true; checkpoint = message_start = message_current; goto read_real_message; } } ++message_current; } while (message_current < message_end); goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks } else { read_real_message: // Note that SSE2 can be enabled on certain piii platforms. #if __SSE2__ { const char* const message_end_m16 = message_end - 16; __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' }; while (message_current < message_end_m16) { // What this does (using compiler intrinsics): // // Load 16 '\n's into an xmm register // Load 16 bytes of currennt message into an xmm register // Do byte-wise equals on those two xmm registers // Take the first bit of each byte, and put that into the first // 16 bits of a mask // If the mask is zero, no '\n' found. increment by 16 and try again // Else scan forward to find the first set bit. // Increment current by the index of the first set bit // (ffs returns index of first set bit + 1) __m128i msg_bytes = _mm_loadu_si128(const_cast<__m128i *>( reinterpret_cast<const __m128i *>(message_current))); __m128i newline_cmp = _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines)); int newline_msk = _mm_movemask_epi8(newline_cmp); if (newline_msk == 0) { message_current += 16; continue; } message_current += (ffs(newline_msk) - 1); const size_t relative_idx = message_current - message_start; const size_t message_current_idx = 1 + base_idx + relative_idx; lines_.push_back(std::make_pair(last_slash_n_idx_, message_current_idx)); if (lines_.size() == 1) { headers_->WriteFromFramer(checkpoint, 1 + message_current - checkpoint); checkpoint = message_current + 1; const char* begin = headers_->OriginalHeaderStreamBegin(); #if DEBUGFRAMER LOG(INFO) << "First line " << std::string(begin, lines_[0].second); LOG(INFO) << "is_request_: " << is_request_; #endif ProcessFirstLine(begin, begin + lines_[0].second); if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) goto process_lines; else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) goto bottom; } const size_t chars_since_last_slash_n = (message_current_idx - last_slash_n_idx_); last_slash_n_idx_ = message_current_idx; if (chars_since_last_slash_n > 2) { // We have a slash-n, but the last slash n was // more than 2 characters away from this. Thus, we know // that this cannot be an end-of-header. ++message_current; continue; } if ((chars_since_last_slash_n == 1) || (((message_current > message_start) && (*(message_current - 1) == '\r')) || (last_char_was_slash_r_))) { goto process_lines; } ++message_current; } } #endif // __SSE2__ while (message_current < message_end) { if (*message_current != '\n') { ++message_current; continue; } const size_t relative_idx = message_current - message_start; const size_t message_current_idx = 1 + base_idx + relative_idx; lines_.push_back(std::make_pair(last_slash_n_idx_, message_current_idx)); if (lines_.size() == 1) { headers_->WriteFromFramer(checkpoint, 1 + message_current - checkpoint); checkpoint = message_current + 1; const char* begin = headers_->OriginalHeaderStreamBegin(); #if DEBUGFRAMER LOG(INFO) << "First line " << std::string(begin, lines_[0].second); LOG(INFO) << "is_request_: " << is_request_; #endif ProcessFirstLine(begin, begin + lines_[0].second); if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) goto process_lines; else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) goto bottom; } const size_t chars_since_last_slash_n = (message_current_idx - last_slash_n_idx_); last_slash_n_idx_ = message_current_idx; if (chars_since_last_slash_n > 2) { // false positive. ++message_current; continue; } if ((chars_since_last_slash_n == 1) || (((message_current > message_start) && (*(message_current - 1) == '\r')) || (last_char_was_slash_r_))) { goto process_lines; } ++message_current; } } continue; process_lines: ++message_current; DCHECK(message_current >= message_start); if (message_current > message_start) { headers_->WriteFromFramer(checkpoint, message_current - checkpoint); } // Check if we have exceeded maximum headers length // Although we check for this limit before and after we call this function // we check it here as well to make sure that in case the visitor changed // the max_header_length_ (for example after processing the first line) // we handle it gracefully. if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) { parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; visitor_->HandleHeaderError(this); goto bottom; } // Since we know that we won't be writing any more bytes of the header, // we tell that to the headers object. The headers object may make // more efficient allocation decisions when this is signaled. headers_->DoneWritingFromFramer(); { const char* readable_ptr = NULL; size_t readable_size = 0; headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size); visitor_->ProcessHeaderInput(readable_ptr, readable_size); } // Ok, now that we've written everything into our header buffer, it is // time to process the header lines (extract proper values for headers // which are important for framing). ProcessHeaderLines(); if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { goto bottom; } AssignParseStateAfterHeadersHaveBeenParsed(); if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) { goto bottom; } visitor_->ProcessHeaders(*headers_); visitor_->HeaderDone(); if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) { visitor_->MessageDone(); } goto bottom; } // If we've gotten to here, it means that we've consumed all of the // available input. We need to record whether or not the last character we // saw was a '\r' so that a subsequent call to ProcessInput correctly finds // a header framing that is split across the two calls. last_char_was_slash_r_ = (*(message_end - 1) == '\r'); DCHECK(message_current >= message_start); if (message_current > message_start) { headers_->WriteFromFramer(checkpoint, message_current - checkpoint); } bottom: return message_current - original_message_start; } size_t BalsaFrame::BytesSafeToSplice() const { switch (parse_state_) { case BalsaFrameEnums::READING_CHUNK_DATA: return chunk_length_remaining_; case BalsaFrameEnums::READING_UNTIL_CLOSE: return std::numeric_limits<size_t>::max(); case BalsaFrameEnums::READING_CONTENT: return content_length_remaining_; default: return 0; } } void BalsaFrame::BytesSpliced(size_t bytes_spliced) { switch (parse_state_) { case BalsaFrameEnums::READING_CHUNK_DATA: if (chunk_length_remaining_ >= bytes_spliced) { chunk_length_remaining_ -= bytes_spliced; if (chunk_length_remaining_ == 0) { parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; } return; } else { last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; goto error_exit; } case BalsaFrameEnums::READING_UNTIL_CLOSE: return; case BalsaFrameEnums::READING_CONTENT: if (content_length_remaining_ >= bytes_spliced) { content_length_remaining_ -= bytes_spliced; if (content_length_remaining_ == 0) { parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; visitor_->MessageDone(); } return; } else { last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; goto error_exit; } default: last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO; goto error_exit; } error_exit: parse_state_ = BalsaFrameEnums::PARSE_ERROR; visitor_->HandleBodyError(this); }; // You may note that the state-machine contained within this function has both // switch and goto labels for nearly the same thing. For instance, the // following two labels refer to the same code block: // label_reading_chunk_data: // case BalsaFrameEnums::READING_CHUNK_DATA: // The 'case' statement is required for the switch statement which occurs when // ProcessInput is invoked. The goto label is required as the state-machine // does not use a computed goto in any subsequent operations. // // Since several states exit the state machine for various reasons, there is // also one label at the bottom of the function. When it is appropriate to // return from the function, that part of the state machine instead issues a // goto bottom; This results in less code duplication, and makes debugging // easier (as you can add a statement to a section of code which is guaranteed // to be invoked when the function is exiting. size_t BalsaFrame::ProcessInput(const char* input, size_t size) { const char* current = input; const char* on_entry = current; const char* end = current + size; #if DEBUGFRAMER LOG(INFO) << "\n==============" << BalsaFrameEnums::ParseStateToString(parse_state_) << "===============\n"; #endif // DEBUGFRAMER DCHECK(headers_ != NULL); if (headers_ == NULL) return 0; if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { const size_t header_length = headers_->GetReadableBytesFromHeaderStream(); // Yes, we still have to check this here as the user can change the // max_header_length amount! // Also it is possible that we have reached the maximum allowed header size, // and we have more to consume (remember we are still inside // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error. if (header_length > max_header_length_ || (header_length == max_header_length_ && size > 0)) { parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; visitor_->HandleHeaderError(this); goto bottom; } size_t bytes_to_process = max_header_length_ - header_length; if (bytes_to_process > size) { bytes_to_process = size; } current += ProcessHeaders(input, bytes_to_process); // If we are still reading headers check if we have crossed the headers // limit. Note that we check for >= as opposed to >. This is because if // header_length_after equals max_header_length_ and we are still in the // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for // sure that the headers limit will be crossed later on if ((parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE)) { // Note that headers_ is valid only if we are still reading headers. const size_t header_length_after = headers_->GetReadableBytesFromHeaderStream(); if (header_length_after >= max_header_length_) { parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; visitor_->HandleHeaderError(this); } } goto bottom; } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ || parse_state_ == BalsaFrameEnums::PARSE_ERROR) { // Can do nothing more 'till we're reset. goto bottom; } while (current < end) { switch (parse_state_) { label_reading_chunk_length: case BalsaFrameEnums::READING_CHUNK_LENGTH: // In this state we read the chunk length. // Note that once we hit a character which is not in: // [0-9;A-Fa-f\n], we transition to a different state. // { // If we used strtol, etc, we'd have to buffer this line. // This is more annoying than simply doing the conversion // here. This code accounts for overflow. static const signed char buf[] = { // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1, // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1, // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; // valid cases: // "09123\n" // -> 09123 // "09123\r\n" // -> 09123 // "09123 \n" // -> 09123 // "09123 \r\n" // -> 09123 // "09123 12312\n" // -> 09123 // "09123 12312\r\n" // -> 09123 // "09123; foo=bar\n" // -> 09123 // "09123; foo=bar\r\n" // -> 09123 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF // invalid cases: // "[ \t]+[^\n]*\n" // "FFFFFFFFFFFFFFFFF\r\n" (would overflow) // "\r\n" // "\n" while (current < end) { const char c = *current; ++current; const signed char addition = buf[static_cast<int>(c)]; if (addition >= 0) { chunk_length_character_extracted_ = true; size_t length_x_16 = chunk_length_remaining_ * 16; const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16; if ((chunk_length_remaining_ > kMaxDiv16) || ((std::numeric_limits<size_t>::max() - length_x_16) < static_cast<size_t>(addition))) { // overflow -- asked for a chunk-length greater than 2^64 - 1!! parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW; visitor_->ProcessBodyInput(on_entry, current - on_entry); visitor_->HandleChunkingError(this); goto bottom; } chunk_length_remaining_ = length_x_16 + addition; continue; } if (!chunk_length_character_extracted_ || addition == -1) { // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no // characters were converted, or an unexpected character was // seen. parse_state_ = BalsaFrameEnums::PARSE_ERROR; last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH; visitor_->ProcessBodyInput(on_entry, current - on_entry); visitor_->HandleChunkingError(this); goto bottom; } --current; parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION; visitor_->ProcessChunkLength(chunk_length_remaining_); goto label_reading_chunk_extension; } } visitor_->ProcessBodyInput(on_entry, current - on_entry); goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH label_reading_chunk_extension: case BalsaFrameEnums::READING_CHUNK_EXTENSION: { // TODO(phython): Convert this scanning to be 16 bytes at a time if // there is data to be read. const char* extensions_start = current; size_t extensions_length = 0; while (current < end) { const char c = *current; if (c == '\r' || c == '\n') { extensions_length = (extensions_start == current) ? 0 : current - extensions_start - 1; } ++current; if (c == '\n') { chunk_length_character_extracted_ = false; visitor_->ProcessChunkExtensions( extensions_start, extensions_length); if (chunk_length_remaining_ != 0) { parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA; goto label_reading_chunk_data; } HeaderFramingFound('\n'); parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM; goto label_reading_last_chunk_term; } } visitor_->ProcessChunkExtensions( extensions_start, extensions_length); } visitor_->ProcessBodyInput(on_entry, current - on_entry); goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION label_reading_chunk_data: case BalsaFrameEnums::READING_CHUNK_DATA: while (current < end) { if (chunk_length_remaining_ == 0) { break; } // read in the chunk size_t bytes_remaining = end - current; size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ? chunk_length_remaining_ : bytes_remaining; const char* tmp_current = current + consumed_bytes; visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry); visitor_->ProcessBodyData(current, consumed_bytes); on_entry = current = tmp_current; chunk_length_remaining_ -= consumed_bytes; } if (chunk_length_remaining_ == 0) { parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; goto label_reading_chunk_term; } visitor_->ProcessBodyInput(on_entry, current - on_entry); goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA label_reading_chunk_term: case BalsaFrameEnums::READING_CHUNK_TERM: while (current < end) { const char c = *current; ++current; if (c == '\n') { parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; goto label_reading_chunk_length; } } visitor_->ProcessBodyInput(on_entry, current - on_entry); goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM label_reading_last_chunk_term: case BalsaFrameEnums::READING_LAST_CHUNK_TERM: while (current < end) { const char c = *current; if (!HeaderFramingFound(c)) { // If not, however, since the spec only suggests that the // client SHOULD indicate the presence of trailers, we get to // *test* that they did or didn't. // If all of the bytes we've seen since: // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF // are either '\r', or '\n', then we can assume that we don't yet // know if we need to parse headers, or if the next byte will make // the HeaderFramingFound condition (above) true. if (HeaderFramingMayBeFound()) { // If true, then we have seen only characters '\r' or '\n'. ++current; // Lets try again! There is no state change here. continue; } else { // If (!HeaderFramingMayBeFound()), then we know that we must be // reading the first non CRLF character of a trailer. parse_state_ = BalsaFrameEnums::READING_TRAILER; visitor_->ProcessBodyInput(on_entry, current - on_entry); on_entry = current; goto label_reading_trailer; } } else { // If we've found a "\r\n\r\n", then the message // is done. ++current; parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; visitor_->ProcessBodyInput(on_entry, current - on_entry); visitor_->MessageDone(); goto bottom; } break; // from while loop } visitor_->ProcessBodyInput(on_entry, current - on_entry); goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM label_reading_trailer: case BalsaFrameEnums::READING_TRAILER: while (current < end) { const char c = *current; ++current; // TODO(fenix): If we ever care about trailers as part of framing, // deal with them here (see below for part of the 'solution') // if (LineFramingFound(c)) { // trailer_lines_.push_back(make_pair(start_of_line_, // trailer_length_ - 1)); // start_of_line_ = trailer_length_; // } if (HeaderFramingFound(c)) { // ProcessTrailers(visitor_, &trailers_); parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; visitor_->ProcessTrailerInput(on_entry, current - on_entry); visitor_->MessageDone(); goto bottom; } } visitor_->ProcessTrailerInput(on_entry, current - on_entry); break; // case BalsaFrameEnums::READING_TRAILER // Note that there is no label: // 'label_reading_until_close' // here. This is because the state-machine exists immediately after // reading the headers instead of transitioning here (as it would // do if it was consuming all the data it could, all the time). case BalsaFrameEnums::READING_UNTIL_CLOSE: { const size_t bytes_remaining = end - current; if (bytes_remaining > 0) { visitor_->ProcessBodyInput(current, bytes_remaining); visitor_->ProcessBodyData(current, bytes_remaining); current += bytes_remaining; } } goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE // label_reading_content: case BalsaFrameEnums::READING_CONTENT: #if DEBUGFRAMER LOG(INFO) << "ReadingContent: " << content_length_remaining_; #endif // DEBUGFRAMER while (content_length_remaining_ && current < end) { // read in the content const size_t bytes_remaining = end - current; const size_t consumed_bytes = (content_length_remaining_ < bytes_remaining) ? content_length_remaining_ : bytes_remaining; visitor_->ProcessBodyInput(current, consumed_bytes); visitor_->ProcessBodyData(current, consumed_bytes); current += consumed_bytes; content_length_remaining_ -= consumed_bytes; } if (content_length_remaining_ == 0) { parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; visitor_->MessageDone(); } goto bottom; // case BalsaFrameEnums::READING_CONTENT default: // The state-machine should never be in a state that isn't handled // above. This is a glaring logic error, and we should do something // drastic to ensure that this gets looked-at and fixed. LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE << " memory corruption?!"; // COV_NF_LINE } } bottom: #if DEBUGFRAMER LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n" << std::string(input, current) << "\n$$$$$$$$$$$$$$" << BalsaFrameEnums::ParseStateToString(parse_state_) << "$$$$$$$$$$$$$$$" << " consumed: " << (current - input); if (Error()) { LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode()); } #endif // DEBUGFRAMER return current - input; } const uint32 BalsaFrame::kValidTerm1; const uint32 BalsaFrame::kValidTerm1Mask; const uint32 BalsaFrame::kValidTerm2; const uint32 BalsaFrame::kValidTerm2Mask; } // namespace net