/*
* Copyright (C) 2004, 2007, 2008 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#if !USE(GOOGLEURL)
#include "KURL.h"
#include "CString.h"
#include "StringHash.h"
#include "TextEncoding.h"
#include <wtf/HashMap.h>
#include <wtf/StdLibExtras.h>
#if USE(ICU_UNICODE)
#include <unicode/uidna.h>
#elif USE(QT4_UNICODE)
#include <QUrl>
#elif USE(GLIB_UNICODE)
#include <glib.h>
#include <wtf/gtk/GOwnPtr.h>
#endif
#include <stdio.h>
using namespace std;
using namespace WTF;
namespace WebCore {
typedef Vector<char, 512> CharBuffer;
typedef Vector<UChar, 512> UCharBuffer;
// FIXME: This file makes too much use of the + operator on String.
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use Vector<UChar> instead.
enum URLCharacterClasses {
// alpha
SchemeFirstChar = 1 << 0,
// ( alpha | digit | "+" | "-" | "." )
SchemeChar = 1 << 1,
// mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
// unreserved = alphanum | mark
// ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
UserInfoChar = 1 << 2,
// alnum | "." | "-" | "%"
// The above is what the specification says, but we are lenient to
// match existing practice and also allow:
// "_"
HostnameChar = 1 << 3,
// hexdigit | ":" | "%"
IPv6Char = 1 << 4,
// "#" | "?" | "/" | nul
PathSegmentEndChar = 1 << 5,
// not allowed in path
BadChar = 1 << 6
};
static const char hexDigits[17] = "0123456789ABCDEF";
static const unsigned char characterClassTable[256] = {
/* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
/* 2 stx */ BadChar, /* 3 etx */ BadChar,
/* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
/* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar,
/* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar,
/* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
/* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
/* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
/* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
/* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
/* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
/* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
/* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
/* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
/* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
/* 44 , */ UserInfoChar,
/* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
/* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 47 / */ PathSegmentEndChar,
/* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
/* 60 < */ BadChar, /* 61 = */ UserInfoChar,
/* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
/* 64 @ */ 0,
/* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 91 [ */ 0,
/* 92 \ */ 0, /* 93 ] */ 0,
/* 94 ^ */ 0,
/* 95 _ */ UserInfoChar | HostnameChar,
/* 96 ` */ 0,
/* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 123 { */ 0,
/* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
/* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
/* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
/* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
/* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
/* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
/* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
/* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
/* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
/* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
/* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
/* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
/* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
/* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
/* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
/* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
/* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
/* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
/* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
/* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
/* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
/* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
/* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
/* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
/* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
/* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
/* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
/* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
/* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
/* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
/* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
/* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
/* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
static String substituteBackslashes(const String&);
static bool isValidProtocol(const String&);
static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
static inline int hexDigitValue(UChar c)
{
ASSERT(isASCIIHexDigit(c));
if (c < 'A')
return c - '0';
return (c - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
}
// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
static void copyASCII(const UChar* src, int length, char* dest)
{
for (int i = 0; i < length; i++)
dest[i] = static_cast<char>(src[i]);
}
static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
{
buffer.resize(base.length() + len + 1);
copyASCII(base.characters(), base.length(), buffer.data());
memcpy(buffer.data() + base.length(), rel, len);
buffer[buffer.size() - 1] = '\0';
}
// FIXME: Move to PlatformString.h eventually.
// Returns the index of the first index in string |s| of any of the characters
// in |toFind|. |toFind| should be a null-terminated string, all characters up
// to the null will be searched. Returns int if not found.
static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind)
{
for (int i = startPos; i < sLen; i++) {
const char* cur = toFind;
while (*cur) {
if (s[i] == *(cur++))
return i;
}
}
return -1;
}
#ifndef NDEBUG
static void checkEncodedString(const String& url)
{
for (unsigned i = 0; i < url.length(); ++i)
ASSERT(!(url[i] & ~0x7F));
ASSERT(!url.length() || isSchemeFirstChar(url[0]));
}
#else
static inline void checkEncodedString(const String&)
{
}
#endif
inline bool KURL::protocolIs(const String& string, const char* protocol)
{
return WebCore::protocolIs(string, protocol);
}
void KURL::invalidate()
{
m_isValid = false;
m_protocolInHTTPFamily = false;
m_schemeEnd = 0;
m_userStart = 0;
m_userEnd = 0;
m_passwordEnd = 0;
m_hostEnd = 0;
m_portEnd = 0;
m_pathEnd = 0;
m_pathAfterLastSlash = 0;
m_queryEnd = 0;
m_fragmentEnd = 0;
}
KURL::KURL(ParsedURLStringTag, const char* url)
{
parse(url, 0);
ASSERT(url == m_string);
}
KURL::KURL(ParsedURLStringTag, const String& url)
{
parse(url);
ASSERT(url == m_string);
}
KURL::KURL(const KURL& base, const String& relative)
{
init(base, relative, UTF8Encoding());
}
KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding)
{
// For UTF-{7,16,32}, we want to use UTF-8 for the query part as
// we do when submitting a form. A form with GET method
// has its contents added to a URL as query params and it makes sense
// to be consistent.
init(base, relative, encoding.encodingForFormSubmission());
}
void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding)
{
// Allow resolutions with a null or empty base URL, but not with any other invalid one.
// FIXME: Is this a good rule?
if (!base.m_isValid && !base.isEmpty()) {
m_string = relative;
invalidate();
return;
}
// For compatibility with Win IE, treat backslashes as if they were slashes,
// as long as we're not dealing with javascript: or data: URLs.
String rel = relative;
if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
rel = substituteBackslashes(rel);
String* originalString = &rel;
bool allASCII = charactersAreAllASCII(rel.characters(), rel.length());
CharBuffer strBuffer;
char* str;
size_t len;
if (allASCII) {
len = rel.length();
strBuffer.resize(len + 1);
copyASCII(rel.characters(), len, strBuffer.data());
strBuffer[len] = 0;
str = strBuffer.data();
} else {
originalString = 0;
encodeRelativeString(rel, encoding, strBuffer);
str = strBuffer.data();
len = strlen(str);
}
// Get rid of leading whitespace.
while (*str == ' ') {
originalString = 0;
str++;
--len;
}
// Get rid of trailing whitespace.
while (len && str[len - 1] == ' ') {
originalString = 0;
str[--len] = '\0';
}
// According to the RFC, the reference should be interpreted as an
// absolute URI if possible, using the "leftmost, longest"
// algorithm. If the URI reference is absolute it will have a
// scheme, meaning that it will have a colon before the first
// non-scheme element.
bool absolute = false;
char* p = str;
if (isSchemeFirstChar(*p)) {
++p;
while (isSchemeChar(*p)) {
++p;
}
if (*p == ':') {
if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) {
str = p + 1;
originalString = 0;
} else
absolute = true;
}
}
CharBuffer parseBuffer;
if (absolute) {
parse(str, originalString);
} else {
// If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
// unless the relative URL is a single fragment.
if (!base.isHierarchical()) {
if (str[0] == '#') {
appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
parse(parseBuffer.data(), 0);
} else {
m_string = relative;
invalidate();
}
return;
}
switch (str[0]) {
case '\0':
// The reference is empty, so this is a reference to the same document with any fragment identifier removed.
*this = base;
removeFragmentIdentifier();
break;
case '#': {
// must be fragment-only reference
appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
parse(parseBuffer.data(), 0);
break;
}
case '?': {
// query-only reference, special case needed for non-URL results
appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
parse(parseBuffer.data(), 0);
break;
}
case '/':
// must be net-path or absolute-path reference
if (str[1] == '/') {
// net-path
appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
parse(parseBuffer.data(), 0);
} else {
// abs-path
appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
parse(parseBuffer.data(), 0);
}
break;
default:
{
// must be relative-path reference
// Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
parseBuffer.resize(base.m_pathEnd + 1 + len + 1);
char* bufferPos = parseBuffer.data();
// first copy everything before the path from the base
unsigned baseLength = base.m_string.length();
const UChar* baseCharacters = base.m_string.characters();
CharBuffer baseStringBuffer(baseLength);
copyASCII(baseCharacters, baseLength, baseStringBuffer.data());
const char* baseString = baseStringBuffer.data();
const char* baseStringStart = baseString;
const char* pathStart = baseStringStart + base.m_portEnd;
while (baseStringStart < pathStart)
*bufferPos++ = *baseStringStart++;
char* bufferPathStart = bufferPos;
// now copy the base path
const char* baseStringEnd = baseString + base.m_pathEnd;
// go back to the last slash
while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
baseStringEnd--;
if (baseStringEnd == baseStringStart) {
// no path in base, add a path separator if necessary
if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
*bufferPos++ = '/';
} else {
bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
}
const char* relStringStart = str;
const char* relStringPos = relStringStart;
while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
if (isPathSegmentEndChar(relStringPos[1])) {
// skip over "." segment
relStringPos += 1;
if (relStringPos[0] == '/')
relStringPos++;
continue;
} else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
// skip over ".." segment and rewind the last segment
// the RFC leaves it up to the app to decide what to do with excess
// ".." segments - we choose to drop them since some web content
// relies on this.
relStringPos += 2;
if (relStringPos[0] == '/')
relStringPos++;
if (bufferPos > bufferPathStart + 1)
bufferPos--;
while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/')
bufferPos--;
continue;
}
}
*bufferPos = *relStringPos;
relStringPos++;
bufferPos++;
}
// all done with the path work, now copy any remainder
// of the relative reference; this will also add a null terminator
strcpy(bufferPos, relStringPos);
parse(parseBuffer.data(), 0);
ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
break;
}
}
}
}
KURL KURL::copy() const
{
KURL result = *this;
result.m_string = result.m_string.crossThreadString();
return result;
}
bool KURL::hasPath() const
{
return m_pathEnd != m_portEnd;
}
String KURL::lastPathComponent() const
{
if (!hasPath())
return String();
int end = m_pathEnd - 1;
if (m_string[end] == '/')
--end;
int start = m_string.reverseFind('/', end);
if (start < m_portEnd)
return String();
++start;
return m_string.substring(start, end - start + 1);
}
String KURL::protocol() const
{
return m_string.left(m_schemeEnd);
}
String KURL::host() const
{
int start = hostStart();
return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start));
}
unsigned short KURL::port() const
{
if (m_hostEnd == m_portEnd)
return 0;
int number = m_string.substring(m_hostEnd + 1, m_portEnd - m_hostEnd - 1).toInt();
if (number < 0 || number > 0xFFFF)
return 0;
return number;
}
String KURL::pass() const
{
if (m_passwordEnd == m_userEnd)
return String();
return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
}
String KURL::user() const
{
return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
}
String KURL::fragmentIdentifier() const
{
if (m_fragmentEnd == m_queryEnd)
return String();
return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
}
bool KURL::hasFragmentIdentifier() const
{
return m_fragmentEnd != m_queryEnd;
}
String KURL::baseAsString() const
{
return m_string.left(m_pathAfterLastSlash);
}
#ifdef NDEBUG
static inline void assertProtocolIsGood(const char*)
{
}
#else
static void assertProtocolIsGood(const char* protocol)
{
const char* p = protocol;
while (*p) {
ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
++p;
}
}
#endif
bool KURL::protocolIs(const char* protocol) const
{
assertProtocolIsGood(protocol);
// JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid.
// The free function protocolIsJavaScript() should be used instead.
ASSERT(!equalIgnoringCase(protocol, String("javascript")));
if (!m_isValid)
return false;
// Do the comparison without making a new string object.
for (int i = 0; i < m_schemeEnd; ++i) {
if (!protocol[i] || toASCIILower(m_string[i]) != protocol[i])
return false;
}
return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
}
String KURL::query() const
{
if (m_queryEnd == m_pathEnd)
return String();
return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
}
String KURL::path() const
{
return decodeURLEscapeSequences(m_string.substring(m_portEnd, m_pathEnd - m_portEnd));
}
bool KURL::setProtocol(const String& s)
{
// Firefox and IE remove everything after the first ':'.
int separatorPosition = s.find(':');
String newProtocol = s.substring(0, separatorPosition);
if (!isValidProtocol(newProtocol))
return false;
if (!m_isValid) {
parse(newProtocol + ":" + m_string);
return true;
}
parse(newProtocol + m_string.substring(m_schemeEnd));
return true;
}
void KURL::setHost(const String& s)
{
if (!m_isValid)
return;
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
// and to avoid changing more than just the host.
bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
}
void KURL::removePort()
{
if (m_hostEnd == m_portEnd)
return;
parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
}
void KURL::setPort(unsigned short i)
{
if (!m_isValid)
return;
bool colonNeeded = m_portEnd == m_hostEnd;
int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
}
void KURL::setHostAndPort(const String& hostAndPort)
{
if (!m_isValid)
return;
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
// and to avoid changing more than just host and port.
bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
}
void KURL::setUser(const String& user)
{
if (!m_isValid)
return;
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
// and to avoid changing more than just the user login.
String u;
int end = m_userEnd;
if (!user.isEmpty()) {
u = user;
if (m_userStart == m_schemeEnd + 1)
u = "//" + u;
// Add '@' if we didn't have one before.
if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
u.append('@');
} else {
// Remove '@' if we now have neither user nor password.
if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
end += 1;
}
parse(m_string.left(m_userStart) + u + m_string.substring(end));
}
void KURL::setPass(const String& password)
{
if (!m_isValid)
return;
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
// and to avoid changing more than just the user password.
String p;
int end = m_passwordEnd;
if (!password.isEmpty()) {
p = ":" + password + "@";
if (m_userEnd == m_schemeEnd + 1)
p = "//" + p;
// Eat the existing '@' since we are going to add our own.
if (end != m_hostEnd && m_string[end] == '@')
end += 1;
} else {
// Remove '@' if we now have neither user nor password.
if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
end += 1;
}
parse(m_string.left(m_userEnd) + p + m_string.substring(end));
}
void KURL::setFragmentIdentifier(const String& s)
{
if (!m_isValid)
return;
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
parse(m_string.left(m_queryEnd) + "#" + s);
}
void KURL::removeFragmentIdentifier()
{
if (!m_isValid)
return;
parse(m_string.left(m_queryEnd));
}
void KURL::setQuery(const String& query)
{
if (!m_isValid)
return;
// FIXME: '#' and non-ASCII characters must be encoded and escaped.
// Usually, the query is encoded using document encoding, not UTF-8, but we don't have
// access to the document in this function.
if ((query.isEmpty() || query[0] != '?') && !query.isNull())
parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
else
parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
}
void KURL::setPath(const String& s)
{
if (!m_isValid)
return;
// FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
// may be inadvertently affected.
parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(s) + m_string.substring(m_pathEnd));
}
String KURL::prettyURL() const
{
if (!m_isValid)
return m_string;
Vector<UChar> result;
append(result, protocol());
result.append(':');
Vector<UChar> authority;
if (m_hostEnd != m_passwordEnd) {
if (m_userEnd != m_userStart) {
append(authority, user());
authority.append('@');
}
append(authority, host());
if (hasPort()) {
authority.append(':');
append(authority, String::number(port()));
}
}
if (!authority.isEmpty()) {
result.append('/');
result.append('/');
result.append(authority);
} else if (protocolIs("file")) {
result.append('/');
result.append('/');
}
append(result, path());
if (m_pathEnd != m_queryEnd) {
result.append('?');
append(result, query());
}
if (m_fragmentEnd != m_queryEnd) {
result.append('#');
append(result, fragmentIdentifier());
}
return String::adopt(result);
}
String decodeURLEscapeSequences(const String& str)
{
return decodeURLEscapeSequences(str, UTF8Encoding());
}
String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding)
{
Vector<UChar> result;
CharBuffer buffer;
int length = str.length();
int decodedPosition = 0;
int searchPosition = 0;
int encodedRunPosition;
while ((encodedRunPosition = str.find('%', searchPosition)) >= 0) {
// Find the sequence of %-escape codes.
int encodedRunEnd = encodedRunPosition;
while (length - encodedRunEnd >= 3
&& str[encodedRunEnd] == '%'
&& isASCIIHexDigit(str[encodedRunEnd + 1])
&& isASCIIHexDigit(str[encodedRunEnd + 2]))
encodedRunEnd += 3;
if (encodedRunEnd == encodedRunPosition) {
++searchPosition;
continue;
}
searchPosition = encodedRunEnd;
// Decode the %-escapes into bytes.
unsigned runLength = (encodedRunEnd - encodedRunPosition) / 3;
buffer.resize(runLength);
char* p = buffer.data();
const UChar* q = str.characters() + encodedRunPosition;
for (unsigned i = 0; i < runLength; ++i) {
*p++ = (hexDigitValue(q[1]) << 4) | hexDigitValue(q[2]);
q += 3;
}
// Decode the bytes into Unicode characters.
String decoded = (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data());
if (decoded.isEmpty())
continue;
// Build up the string with what we just skipped and what we just decoded.
result.append(str.characters() + decodedPosition, encodedRunPosition - decodedPosition);
result.append(decoded.characters(), decoded.length());
decodedPosition = encodedRunEnd;
}
result.append(str.characters() + decodedPosition, length - decodedPosition);
return String::adopt(result);
}
bool KURL::isLocalFile() const
{
// Including feed here might be a bad idea since drag and drop uses this check
// and including feed would allow feeds to potentially let someone's blog
// read the contents of the clipboard on a drag, even without a drop.
// Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
return protocolIs("file");
}
static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
{
char* p = buffer;
const char* str = strStart;
const char* strEnd = strStart + length;
while (str < strEnd) {
unsigned char c = *str++;
if (isBadChar(c)) {
if (c == '%' || c == '?') {
*p++ = c;
} else if (c != 0x09 && c != 0x0a && c != 0x0d) {
*p++ = '%';
*p++ = hexDigits[c >> 4];
*p++ = hexDigits[c & 0xF];
}
} else {
*p++ = c;
}
}
buffer = p;
}
// copy a path, accounting for "." and ".." segments
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
{
char* bufferPathStart = dst;
// empty path is a special case, and need not have a leading slash
if (srcStart != srcEnd) {
const char* baseStringStart = src + srcStart;
const char* baseStringEnd = src + srcEnd;
const char* baseStringPos = baseStringStart;
// this code is unprepared for paths that do not begin with a
// slash and we should always have one in the source string
ASSERT(baseStringPos[0] == '/');
// copy the leading slash into the destination
*dst = *baseStringPos;
baseStringPos++;
dst++;
while (baseStringPos < baseStringEnd) {
if (baseStringPos[0] == '.' && dst[-1] == '/') {
if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
// skip over "." segment
baseStringPos += 2;
continue;
} else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
baseStringPos + 2 == baseStringEnd)) {
// skip over ".." segment and rewind the last segment
// the RFC leaves it up to the app to decide what to do with excess
// ".." segments - we choose to drop them since some web content
// relies on this.
baseStringPos += 3;
if (dst > bufferPathStart + 1)
dst--;
// Note that these two while blocks differ subtly.
// The first helps to remove multiple adjoining slashes as we rewind.
// The +1 to bufferPathStart in the first while block prevents eating a leading slash
while (dst > bufferPathStart + 1 && dst[-1] == '/')
dst--;
while (dst > bufferPathStart && dst[-1] != '/')
dst--;
continue;
}
}
*dst = *baseStringPos;
baseStringPos++;
dst++;
}
}
*dst = '\0';
return dst - bufferPathStart;
}
static inline bool hasSlashDotOrDotDot(const char* str)
{
const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
if (!*p)
return false;
unsigned char pc = *p;
while (unsigned char c = *++p) {
if (c == '.' && (pc == '/' || pc == '.'))
return true;
pc = c;
}
return false;
}
static inline bool matchLetter(char c, char lowercaseLetter)
{
return (c | 0x20) == lowercaseLetter;
}
void KURL::parse(const String& string)
{
checkEncodedString(string);
CharBuffer buffer(string.length() + 1);
copyASCII(string.characters(), string.length(), buffer.data());
buffer[string.length()] = '\0';
parse(buffer.data(), &string);
}
void KURL::parse(const char* url, const String* originalString)
{
if (!url || url[0] == '\0') {
// valid URL must be non-empty
m_string = originalString ? *originalString : url;
invalidate();
return;
}
if (!isSchemeFirstChar(url[0])) {
// scheme must start with an alphabetic character
m_string = originalString ? *originalString : url;
invalidate();
return;
}
int schemeEnd = 0;
while (isSchemeChar(url[schemeEnd]))
schemeEnd++;
if (url[schemeEnd] != ':') {
m_string = originalString ? *originalString : url;
invalidate();
return;
}
int userStart = schemeEnd + 1;
int userEnd;
int passwordStart;
int passwordEnd;
int hostStart;
int hostEnd;
int portStart;
int portEnd;
bool hierarchical = url[schemeEnd + 1] == '/';
bool isFile = schemeEnd == 4
&& matchLetter(url[0], 'f')
&& matchLetter(url[1], 'i')
&& matchLetter(url[2], 'l')
&& matchLetter(url[3], 'e');
m_protocolInHTTPFamily = matchLetter(url[0], 'h')
&& matchLetter(url[1], 't')
&& matchLetter(url[2], 't')
&& matchLetter(url[3], 'p')
&& (url[4] == ':' || (matchLetter(url[4], 's') && url[5] == ':'));
if (hierarchical && url[schemeEnd + 2] == '/') {
// The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
// Attempt to find an authority.
// FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
userStart += 2;
userEnd = userStart;
int colonPos = 0;
while (isUserInfoChar(url[userEnd])) {
if (url[userEnd] == ':' && colonPos == 0)
colonPos = userEnd;
userEnd++;
}
if (url[userEnd] == '@') {
// actual end of the userinfo, start on the host
if (colonPos != 0) {
passwordEnd = userEnd;
userEnd = colonPos;
passwordStart = colonPos + 1;
} else
passwordStart = passwordEnd = userEnd;
hostStart = passwordEnd + 1;
} else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
// hit the end of the authority, must have been no user
// or looks like an IPv6 hostname
// either way, try to parse it as a hostname
userEnd = userStart;
passwordStart = passwordEnd = userEnd;
hostStart = userStart;
} else {
// invalid character
m_string = originalString ? *originalString : url;
invalidate();
return;
}
hostEnd = hostStart;
// IPV6 IP address
if (url[hostEnd] == '[') {
hostEnd++;
while (isIPv6Char(url[hostEnd]))
hostEnd++;
if (url[hostEnd] == ']')
hostEnd++;
else {
// invalid character
m_string = originalString ? *originalString : url;
invalidate();
return;
}
} else {
while (isHostnameChar(url[hostEnd]))
hostEnd++;
}
if (url[hostEnd] == ':') {
portStart = portEnd = hostEnd + 1;
// possible start of port
portEnd = portStart;
while (isASCIIDigit(url[portEnd]))
portEnd++;
} else
portStart = portEnd = hostEnd;
if (!isPathSegmentEndChar(url[portEnd])) {
// invalid character
m_string = originalString ? *originalString : url;
invalidate();
return;
}
if (userStart == portEnd && !m_protocolInHTTPFamily && !isFile) {
// No authority found, which means that this is not a net_path, but rather an abs_path whose first two
// path segments are empty. For file, http and https only, an empty authority is allowed.
userStart -= 2;
userEnd = userStart;
passwordStart = userEnd;
passwordEnd = passwordStart;
hostStart = passwordEnd;
hostEnd = hostStart;
portStart = hostEnd;
portEnd = hostEnd;
}
} else {
// the part after the scheme must be an opaque_part or an abs_path
userEnd = userStart;
passwordStart = passwordEnd = userEnd;
hostStart = hostEnd = passwordEnd;
portStart = portEnd = hostEnd;
}
int pathStart = portEnd;
int pathEnd = pathStart;
while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
pathEnd++;
int queryStart = pathEnd;
int queryEnd = queryStart;
if (url[queryStart] == '?') {
while (url[queryEnd] && url[queryEnd] != '#')
queryEnd++;
}
int fragmentStart = queryEnd;
int fragmentEnd = fragmentStart;
if (url[fragmentStart] == '#') {
fragmentStart++;
fragmentEnd = fragmentStart;
while (url[fragmentEnd])
fragmentEnd++;
}
// assemble it all, remembering the real ranges
Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
char *p = buffer.data();
const char *strPtr = url;
// copy in the scheme
const char *schemeEndPtr = url + schemeEnd;
while (strPtr < schemeEndPtr)
*p++ = *strPtr++;
m_schemeEnd = p - buffer.data();
bool hostIsLocalHost = portEnd - userStart == 9
&& matchLetter(url[userStart], 'l')
&& matchLetter(url[userStart+1], 'o')
&& matchLetter(url[userStart+2], 'c')
&& matchLetter(url[userStart+3], 'a')
&& matchLetter(url[userStart+4], 'l')
&& matchLetter(url[userStart+5], 'h')
&& matchLetter(url[userStart+6], 'o')
&& matchLetter(url[userStart+7], 's')
&& matchLetter(url[userStart+8], 't');
// File URLs need a host part unless it is just file:// or file://localhost
bool degenFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || portStart != portEnd;
// add ":" after scheme
*p++ = ':';
// if we have at least one authority part or a file URL - add "//" and authority
if (isFile ? !degenFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
*p++ = '/';
*p++ = '/';
m_userStart = p - buffer.data();
// copy in the user
strPtr = url + userStart;
const char* userEndPtr = url + userEnd;
while (strPtr < userEndPtr)
*p++ = *strPtr++;
m_userEnd = p - buffer.data();
// copy in the password
if (passwordEnd != passwordStart) {
*p++ = ':';
strPtr = url + passwordStart;
const char* passwordEndPtr = url + passwordEnd;
while (strPtr < passwordEndPtr)
*p++ = *strPtr++;
}
m_passwordEnd = p - buffer.data();
// If we had any user info, add "@"
if (p - buffer.data() != m_userStart)
*p++ = '@';
// copy in the host, except in the case of a file URL with authority="localhost"
if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
strPtr = url + hostStart;
const char* hostEndPtr = url + hostEnd;
while (strPtr < hostEndPtr)
*p++ = *strPtr++;
}
m_hostEnd = p - buffer.data();
// copy in the port
if (hostEnd != portStart) {
*p++ = ':';
strPtr = url + portStart;
const char *portEndPtr = url + portEnd;
while (strPtr < portEndPtr)
*p++ = *strPtr++;
}
m_portEnd = p - buffer.data();
} else
m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
// For canonicalization, ensure we have a '/' for no path.
// Do this only for hierarchical URL with protocol http or https.
if (m_protocolInHTTPFamily && hierarchical && pathEnd == pathStart)
*p++ = '/';
// add path, escaping bad characters
if (!hierarchical || !hasSlashDotOrDotDot(url))
appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
else {
CharBuffer pathBuffer(pathEnd - pathStart + 1);
size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
appendEscapingBadChars(p, pathBuffer.data(), length);
}
m_pathEnd = p - buffer.data();
// Find the position after the last slash in the path, or
// the position before the path if there are no slashes in it.
int i;
for (i = m_pathEnd; i > m_portEnd; --i) {
if (buffer[i - 1] == '/')
break;
}
m_pathAfterLastSlash = i;
// add query, escaping bad characters
appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
m_queryEnd = p - buffer.data();
// add fragment, escaping bad characters
if (fragmentEnd != queryEnd) {
*p++ = '#';
appendEscapingBadChars(p, url + fragmentStart, fragmentEnd - fragmentStart);
}
m_fragmentEnd = p - buffer.data();
ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
// If we didn't end up actually changing the original string and
// it was already in a String, reuse it to avoid extra allocation.
if (originalString && originalString->length() == static_cast<unsigned>(m_fragmentEnd) && strncmp(buffer.data(), url, m_fragmentEnd) == 0)
m_string = *originalString;
else
m_string = String(buffer.data(), m_fragmentEnd);
m_isValid = true;
}
bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
{
if (a.m_queryEnd != b.m_queryEnd)
return false;
unsigned queryLength = a.m_queryEnd;
for (unsigned i = 0; i < queryLength; ++i)
if (a.string()[i] != b.string()[i])
return false;
return true;
}
bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
{
if (a.m_schemeEnd != b.m_schemeEnd)
return false;
int hostStartA = a.hostStart();
int hostStartB = b.hostStart();
if (a.m_hostEnd - hostStartA != b.m_hostEnd - hostStartB)
return false;
// Check the scheme
for (int i = 0; i < a.m_schemeEnd; ++i)
if (a.string()[i] != b.string()[i])
return false;
// And the host
for (int i = hostStartA; i < a.m_hostEnd; ++i)
if (a.string()[i] != b.string()[i])
return false;
if (a.port() != b.port())
return false;
return true;
}
String encodeWithURLEscapeSequences(const String& notEncodedString)
{
CString asUTF8 = notEncodedString.utf8();
CharBuffer buffer(asUTF8.length() * 3 + 1);
char* p = buffer.data();
const char* str = asUTF8.data();
const char* strEnd = str + asUTF8.length();
while (str < strEnd) {
unsigned char c = *str++;
if (isBadChar(c)) {
*p++ = '%';
*p++ = hexDigits[c >> 4];
*p++ = hexDigits[c & 0xF];
} else
*p++ = c;
}
ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
return String(buffer.data(), p - buffer.data());
}
// Appends the punycoded hostname identified by the given string and length to
// the output buffer. The result will not be null terminated.
static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen)
{
// Needs to be big enough to hold an IDN-encoded name.
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
const unsigned hostnameBufferLength = 2048;
if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) {
buffer.append(str, strLen);
return;
}
#if USE(ICU_UNICODE)
UChar hostnameBuffer[hostnameBufferLength];
UErrorCode error = U_ZERO_ERROR;
int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer,
hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
if (error == U_ZERO_ERROR)
buffer.append(hostnameBuffer, numCharactersConverted);
#elif USE(QT4_UNICODE)
QByteArray result = QUrl::toAce(String(str, strLen));
buffer.append(result.constData(), result.length());
#elif USE(GLIB_UNICODE)
GOwnPtr<gchar> utf8Hostname;
GOwnPtr<GError> utf8Err;
utf8Hostname.set(g_utf16_to_utf8(str, strLen, 0, 0, &utf8Err.outPtr()));
if (utf8Err)
return;
GOwnPtr<gchar> encodedHostname;
encodedHostname.set(g_hostname_to_ascii(utf8Hostname.get()));
if (!encodedHostname)
return;
buffer.append(encodedHostname.get(), strlen(encodedHostname.get()));
#endif
}
static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges)
{
// In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
// Skip quoted strings so that characters in them don't confuse us.
// When we find a '?' character, we are past the part of the URL that contains host names.
nameRanges.clear();
int p = 0;
while (1) {
// Find start of host name or of quoted string.
int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?");
if (hostnameOrStringStart == -1)
return;
UChar c = str[hostnameOrStringStart];
p = hostnameOrStringStart + 1;
if (c == '?')
return;
if (c == '@') {
// Find end of host name.
int hostnameStart = p;
int hostnameEnd = findFirstOf(str, strLen, p, ">,?");
bool done;
if (hostnameEnd == -1) {
hostnameEnd = strLen;
done = true;
} else {
p = hostnameEnd;
done = false;
}
nameRanges.append(make_pair(hostnameStart, hostnameEnd));
if (done)
return;
} else {
// Skip quoted string.
ASSERT(c == '"');
while (1) {
int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\");
if (escapedCharacterOrStringEnd == -1)
return;
c = str[escapedCharacterOrStringEnd];
p = escapedCharacterOrStringEnd + 1;
// If we are the end of the string, then break from the string loop back to the host name loop.
if (c == '"')
break;
// Skip escaped character.
ASSERT(c == '\\');
if (p == strLen)
return;
++p;
}
}
}
}
static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset)
{
// Find the host name in a hierarchical URL.
// It comes after a "://" sequence, with scheme characters preceding, and
// this should be the first colon in the string.
// It ends with the end of the string or a ":" or a path segment ending character.
// If there is a "@" character, the host part is just the part after the "@".
int separator = findFirstOf(str, strLen, 0, ":");
if (separator == -1 || separator + 2 >= strLen ||
str[separator + 1] != '/' || str[separator + 2] != '/')
return false;
// Check that all characters before the :// are valid scheme characters.
if (!isSchemeFirstChar(str[0]))
return false;
for (int i = 1; i < separator; ++i) {
if (!isSchemeChar(str[i]))
return false;
}
// Start after the separator.
int authorityStart = separator + 3;
// Find terminating character.
int hostnameEnd = strLen;
for (int i = authorityStart; i < strLen; ++i) {
UChar c = str[i];
if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
hostnameEnd = i;
break;
}
}
// Find "@" for the start of the host name.
int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@");
int hostnameStart;
if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
hostnameStart = authorityStart;
else
hostnameStart = userInfoTerminator + 1;
startOffset = hostnameStart;
endOffset = hostnameEnd;
return true;
}
// Converts all hostnames found in the given input to punycode, preserving the
// rest of the URL unchanged. The output will NOT be null-terminated.
static void encodeHostnames(const String& str, UCharBuffer& output)
{
output.clear();
if (protocolIs(str, "mailto")) {
Vector<pair<int, int> > hostnameRanges;
findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges);
int n = hostnameRanges.size();
int p = 0;
for (int i = 0; i < n; ++i) {
const pair<int, int>& r = hostnameRanges[i];
output.append(&str.characters()[p], r.first - p);
appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first);
p = r.second;
}
// This will copy either everything after the last hostname, or the
// whole thing if there is no hostname.
output.append(&str.characters()[p], str.length() - p);
} else {
int hostStart, hostEnd;
if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) {
output.append(str.characters(), hostStart); // Before hostname.
appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart);
output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname.
} else {
// No hostname to encode, return the input.
output.append(str.characters(), str.length());
}
}
}
static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
{
UCharBuffer s;
encodeHostnames(rel, s);
TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
int pathEnd = -1;
if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
// Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
pathEnd = findFirstOf(s.data(), s.size(), 0, "#?");
}
if (pathEnd == -1) {
CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables);
output.resize(decoded.length());
memcpy(output.data(), decoded.data(), decoded.length());
} else {
CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables);
// Unencodable characters in URLs are represented by converting
// them to XML entities and escaping non-alphanumeric characters.
CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables);
output.resize(pathDecoded.length() + otherDecoded.length());
memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
}
output.append('\0'); // null-terminate the output.
}
static String substituteBackslashes(const String& string)
{
int questionPos = string.find('?');
int hashPos = string.find('#');
int pathEnd;
if (hashPos >= 0 && (questionPos < 0 || questionPos > hashPos))
pathEnd = hashPos;
else if (questionPos >= 0)
pathEnd = questionPos;
else
pathEnd = string.length();
return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
}
bool KURL::isHierarchical() const
{
if (!m_isValid)
return false;
ASSERT(m_string[m_schemeEnd] == ':');
return m_string[m_schemeEnd + 1] == '/';
}
void KURL::copyToBuffer(CharBuffer& buffer) const
{
// FIXME: This throws away the high bytes of all the characters in the string!
// That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
buffer.resize(m_string.length());
copyASCII(m_string.characters(), m_string.length(), buffer.data());
}
bool protocolIs(const String& url, const char* protocol)
{
// Do the comparison without making a new string object.
assertProtocolIsGood(protocol);
for (int i = 0; ; ++i) {
if (!protocol[i])
return url[i] == ':';
if (toASCIILower(url[i]) != protocol[i])
return false;
}
}
bool protocolIsJavaScript(const String& url)
{
return protocolIs(url, "javascript");
}
bool isValidProtocol(const String& protocol)
{
// RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
if (protocol.isEmpty())
return false;
if (!isSchemeFirstChar(protocol[0]))
return false;
unsigned protocolLength = protocol.length();
for (unsigned i = 1; i < protocolLength; i++) {
if (!isSchemeChar(protocol[i]))
return false;
}
return true;
}
bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
{
if (protocol.isEmpty())
return false;
typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
if (defaultPorts.isEmpty()) {
defaultPorts.set("http", 80);
defaultPorts.set("https", 443);
defaultPorts.set("ftp", 21);
defaultPorts.set("ftps", 990);
}
return defaultPorts.get(protocol) == port;
}
bool portAllowed(const KURL& url)
{
unsigned short port = url.port();
// Since most URLs don't have a port, return early for the "no port" case.
if (!port)
return true;
// This blocked port list matches the port blocking that Mozilla implements.
// See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
static const unsigned short blockedPortList[] = {
1, // tcpmux
7, // echo
9, // discard
11, // systat
13, // daytime
15, // netstat
17, // qotd
19, // chargen
20, // FTP-data
21, // FTP-control
22, // SSH
23, // telnet
25, // SMTP
37, // time
42, // name
43, // nicname
53, // domain
77, // priv-rjs
79, // finger
87, // ttylink
95, // supdup
101, // hostriame
102, // iso-tsap
103, // gppitnp
104, // acr-nema
109, // POP2
110, // POP3
111, // sunrpc
113, // auth
115, // SFTP
117, // uucp-path
119, // nntp
123, // NTP
135, // loc-srv / epmap
139, // netbios
143, // IMAP2
179, // BGP
389, // LDAP
465, // SMTP+SSL
512, // print / exec
513, // login
514, // shell
515, // printer
526, // tempo
530, // courier
531, // Chat
532, // netnews
540, // UUCP
556, // remotefs
563, // NNTP+SSL
587, // ESMTP
601, // syslog-conn
636, // LDAP+SSL
993, // IMAP+SSL
995, // POP3+SSL
2049, // NFS
3659, // apple-sasl / PasswordServer [Apple addition]
4045, // lockd
6000, // X11
6665, // Alternate IRC [Apple addition]
6666, // Alternate IRC [Apple addition]
6667, // Standard IRC [Apple addition]
6668, // Alternate IRC [Apple addition]
6669, // Alternate IRC [Apple addition]
};
const unsigned short* const blockedPortListEnd = blockedPortList + sizeof(blockedPortList) / sizeof(blockedPortList[0]);
#ifndef NDEBUG
// The port list must be sorted for binary_search to work.
static bool checkedPortList = false;
if (!checkedPortList) {
for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
ASSERT(*p < *(p + 1));
checkedPortList = true;
}
#endif
// If the port is not in the blocked port list, allow it.
if (!binary_search(blockedPortList, blockedPortListEnd, port))
return true;
// Allow ports 21 and 22 for FTP URLs, as Mozilla does.
if ((port == 21 || port == 22) && url.protocolIs("ftp"))
return true;
// Allow any port number in a file URL, since the port number is ignored.
if (url.protocolIs("file"))
return true;
return false;
}
String mimeTypeFromDataURL(const String& url)
{
ASSERT(protocolIs(url, "data"));
int index = url.find(';');
if (index == -1)
index = url.find(',');
if (index != -1) {
int len = index - 5;
if (len > 0)
return url.substring(5, len);
return "text/plain"; // Data URLs with no MIME type are considered text/plain.
}
return "";
}
const KURL& blankURL()
{
DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank"));
return staticBlankURL;
}
#ifndef NDEBUG
void KURL::print() const
{
printf("%s\n", m_string.utf8().data());
}
#endif
}
#endif // !USE(GOOGLEURL)