# encoding: ASCII-8BIT

# iExploder - Generates bad HTML files to perform QA for web browsers.
#
# Copyright 2010 Thomas Stromberg - All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'cgi'
require 'yaml'

require './scanner.rb'
require './version.rb'

# Cache of generated [header_tags, body_tags] pairs, keyed by test number.
# Used to speed up subtest generation.
$TEST_CACHE = {}

# Map of media file extensions to their proper MIME types (not that we always
# honor them).
$MIME_MAP = {
  'bmp' => 'image/bmp',
  'gif' => 'image/gif',
  'jpg' => 'image/jpeg',
  'png' => 'image/png',
  'svg' => 'image/svg+xml',
  'tiff' => 'image/tiff',
  'xbm' => 'image/xbm',
  'ico' => 'image/x-icon',
  'jng' => 'image/x-jng',
  'xpm' => 'image/x-portable-pixmap',
  'ogg' => 'audio/ogg',
  'snd' => 'audio/basic',
  'wav' => 'audio/wav'
}

# These tags get src properties more often than others
$SRC_TAGS = ['img', 'audio', 'video', 'embed']

class IExploder
  attr_accessor :test_num, :subtest_data, :lookup_mode, :random_mode, :cgi_url, :browser, :claimed_browser
  attr_accessor :offset, :lines, :stop_num, :config

  # Loads the YAML configuration, resets the per-request state to its
  # defaults and reads the tag/media data files.
  #
  # config_path - path to the YAML configuration file.
  def initialize(config_path)
    # The block form closes the file handle as soon as parsing is finished;
    # a bare File.open would leave it open until garbage collection.
    @config = File.open(config_path) { |config_file| YAML::load(config_file) }
    @stop_num = nil
    @subtest_data = nil
    @test_num = 0
    @cgi_url = '/iexploder.cgi'
    @browser = 'UNKNOWN'
    @claimed_browser = nil
    readTagFiles()
    return nil
  end

  # Seeds the random number generator: from @test_num when it is positive
  # (so a test number reproduces the same output), otherwise with a fresh
  # seed chosen by Ruby.
  def setRandomSeed
    return srand(@test_num) if @test_num > 0
    srand
  end


  # Populates the tag/value lists and the media table from the
  # subdirectories of the configured 'mangle_data_path'.
  def readTagFiles
    root = @config['mangle_data_path']
    # [instance variable, subdirectory] pairs, read in this order.
    tag_sources = [
      [:@cssTags,        'css-properties'],
      [:@cssPseudoTags,  'css-pseudo'],
      [:@cssAtRules,     'css-atrules'],
      [:@htmlTags,       'html-tags'],
      [:@htmlAttr,       'html-attrs'],
      [:@htmlValues,     'html-values'],
      [:@cssValues,      'css-values'],
      [:@headerValues,   'headers'],
      [:@protocolValues, 'protocols'],
      [:@mimeTypes,      'mime-types']
    ]
    tag_sources.each do |ivar, subdir|
      instance_variable_set(ivar, readTagsDir("#{root}/#{subdir}"))
    end
    @media = readMediaDir("#{root}/media")
  end

  # Reads every regular file directly inside +directory+ with readTagFile and
  # returns the combined entries with duplicates removed.
  def readTagsDir(directory)
    collected = []
    Dir.entries(directory).each do |entry|
      path = "#{directory}/#{entry}"
      # Skips '.', '..' and any subdirectories.
      next unless File.file?(path)
      collected.concat(readTagFile(path))
    end
    collected.uniq
  end

  # Loads the sample media files found directly inside +directory+.
  #
  # Returns a Hash mapping MIME type (looked up in $MIME_MAP by file
  # extension) to the raw file content. Files whose extension has no entry in
  # $MIME_MAP are skipped, so the result never contains a nil key.
  def readMediaDir(directory)
    data = {}
    Dir.foreach(directory) do |filename|
      path = File.join(directory, filename)
      next unless File.file?(path)

      # Use the final extension only ("a.b.png" is a png) and ignore case.
      extension = File.extname(filename).sub(/\A\./, '').downcase
      mime_type = $MIME_MAP[extension]
      next unless mime_type

      # Media is binary data: read it in binary mode so no newline or
      # encoding translation is applied.
      data[mime_type] = File.open(path, 'rb') { |media| media.read }
    end
    return data
  end

  # Reads one tag file and returns its entries as an Array of Strings.
  #
  # Blank lines and comment lines (starting with "# ") are dropped. Only the
  # line terminator is stripped, so a final line without a trailing newline
  # keeps its last character.
  def readTagFile(filename)
    list = []
    # File.readlines opens and closes the file itself (no leaked handle).
    File.readlines(filename).each do |line|
      entry = line.chomp

      # Don't include comments.
      if (entry !~ /^# /) && (entry.length > 0)
        list << entry
      end
    end
    return list
  end


  # Returns a (usually hostile) value for the HTML attribute +tag+.
  #
  # An 'EXCLUDED_' marker is stripped from the attribute name first. Event
  # handlers (on*) mostly get a call expression; src/data/profile mostly get
  # a garbage URL. Everything else is picked by a weighted roll.
  def generateHtmlValue(tag)
    roll = rand(100)
    tag = tag.sub('EXCLUDED_', '')

    # 90% of the time attribute-specific values win over the generic table.
    if roll < 90
      return generateHtmlValue('') + "()" if tag =~ /^on/
      return generateGarbageUrl(tag) if ['src', 'data', 'profile'].include?(tag)
    end

    if roll <= 50
      @htmlValues[rand(@htmlValues.length)]
    elsif roll <= 75
      generateGarbageNumber()
    elsif roll <= 85
      generateGarbageValue()
    elsif roll <= 90
      generateGarbageNumber() + ',' + generateGarbageNumber()
    elsif roll <= 98
      generateGarbageUrl(tag)
    else
      generateOverflow()
    end
  end

  # Returns a test URL for the current test number that requests a randomly
  # chosen MIME type from the loaded media table. +tag+ is not used.
  def generateMediaUrl(tag)
    available = @media.keys
    generateTestUrl(@test_num, nil, nil, available[rand(available.length)])
  end

  # Returns a garbage URL for +tag+, chosen by a weighted roll:
  #   0..30   media test URL
  #   31..75  known protocol + separator ('%', '//../' or '//') + garbage
  #   76..85  overflow ":" garbage
  #   86..97  garbage ":" overflow
  #   98..99  plain overflow
  def generateGarbageUrl(tag)
    roll = rand(100)
    if roll <= 30
      generateMediaUrl(tag)
    elsif roll <= 75
      separator =
        if roll <= 50
          '%'
        elsif roll <= 60
          '//../'
        else
          '//'
        end
      @protocolValues[rand(@protocolValues.length)] + separator + generateGarbageValue()
    elsif roll <= 85
      generateOverflow() + ":" + generateGarbageValue()
    elsif roll <= 97
      generateGarbageValue() + ":" + generateOverflow()
    else
      generateOverflow()
    end
  end

  # Returns a value for the CSS +property+.
  #
  # Roughly half of the time the value has the form the property name
  # suggests (url, length, color, time, or a known CSS value); otherwise it
  # is a garbage number, garbage url, garbage value or an overflow string.
  def generateCssValue(property)
    units = ['', 'em', 'px', '%', 'pt', 'pc', 'ex', 'in', 'cm', 'mm']

    roll = rand(100)
    return generateOverflow() if roll > 98
    return generateGarbageValue() if roll > 85
    return 'url(' + generateGarbageUrl(property) + ')' if roll > 75
    return generateGarbageNumber() if roll > 50

    # The most likely scenario: a value shaped for this kind of property.
    name = property.sub('EXCLUDED_', '')
    if name =~ /-image|content/
      'url(' + generateGarbageUrl(property) + ')'
    elsif name =~ /-width|-radius|-spacing|margin|padding|height/
      generateGarbageValue() + units[rand(units.length)]
    elsif name =~ /-color/
      generateGarbageColor()
    elsif name =~ /-delay|-duration/
      generateGarbageValue() + 'ms'
    else
      @cssValues[rand(@cssValues.length)]
    end
  end

  # Returns a garbage CSS color: a '#' value, an rgb() triple, an rgb()
  # percentage triple, or (rarely) an overflow string.
  def generateGarbageColor()
    roll = rand(100)
    if roll <= 50
      '#' + generateGarbageValue()
    elsif roll <= 70
      red, green, blue = generateGarbageNumber(), generateGarbageNumber(), generateGarbageNumber()
      'rgb(' + red + ',' + green + ',' + blue + ')'
    elsif roll <= 98
      red, green, blue = generateGarbageNumber(), generateGarbageNumber(), generateGarbageNumber()
      'rgb(' + red + '%,' + green + '%,' + blue + '%)'
    else
      generateOverflow()
    end
  end

  # Returns a String holding an awkward "number": zero, long runs of nines
  # (optionally negative), huge decimals, garbage text or an overflow string.
  def generateGarbageNumber()
    roll = rand(100)
    if roll == 0
      '0'
    elsif roll <= 40
      '9' * rand(100)
    elsif roll <= 60
      '999999.' + rand(999999999999999999999).to_s
    elsif roll <= 80
      '-' + ('9' * rand(100))
    elsif roll <= 90
      '-999999.' + rand(999999999999999999999).to_s
    elsif roll <= 98
      generateGarbageText()
    else
      generateOverflow()
    end
  end

  # Returns a garbage String: a repeated random byte, format-string
  # specifiers, repeated numeric character references, repeated "\x" escape
  # sequences, or repeated runs of punctuation and hex characters.
  # Repeat counts are bounded by the 'buffer_overflow_length' and
  # 'max_garbage_text_size' configuration values.
  def generateGarbageValue()
    roll = rand(100)
    if roll <= 30
      rand(255).chr * rand(@config['buffer_overflow_length'])
    elsif roll <= 50
      "%n" * 50
    elsif roll <= 65
      ("&#" + rand(999999).to_s + ";") * rand(@config['max_garbage_text_size'])
    elsif roll <= 70
      # Between 2 and 21 escape sequences, repeated as a group.
      escapes = Array.new(rand(20) + 2) { "\\x" + rand(65535).to_s(16) }
      escapes.join('') * rand(@config['max_garbage_text_size'])
    else
      alphabet = '%?!$#^0123456789ABCDEF%#./\&|;'
      # Between 2 and 21 characters from the alphabet, repeated as a group.
      picked = Array.new(rand(20) + 2) { alphabet[rand(alphabet.length)].chr }
      picked.join('') * rand(@config['max_garbage_text_size'])
    end
  end

  # Returns one random byte repeated 'buffer_overflow_length' times plus up
  # to 499 extra repetitions.
  def generateOverflow()
    filler = rand(255).chr
    filler * (@config['buffer_overflow_length'] + rand(500))
  end

  # Returns garbage text: usually 129 'X' characters, otherwise format-string
  # specifiers, repeated character references, a garbage value, a repeated
  # random byte, or an overflow string.
  def generateGarbageText
    roll = rand(100)
    if roll <= 70
      'X' * 129
    elsif roll <= 75
      "%n" * 15
    elsif roll <= 85
      ("&#" + rand(9999999999999).to_s + ";") * rand(@config['max_garbage_text_size'])
    elsif roll <= 90
      generateGarbageValue()
    elsif roll <= 98
      rand(255).chr * rand(@config['max_garbage_text_size'])
    else
      generateOverflow()
    end
  end

  # Tells whether +properties+ is excluded for the current browser.
  #
  # properties - [tag, attribute] or [tag, 'style', css_property].
  #
  # The 'exclude' configuration maps dotted entries (for example 'img.src' or
  # '*.style.color', where '*' stands for any tag) to a browser pattern. An
  # entry applies when @browser matches that pattern. Returns true or false.
  def isPropertyInBlacklist(properties)
    excluded = @config.has_key?('exclude') && @config['exclude']
    return false unless excluded

    any_tag = properties.dup
    any_tag[0] = '*'
    candidates = [properties.join('.'), any_tag.join('.')]
    candidates.any? do |entry|
      excluded.has_key?(entry) && @browser =~ /#{excluded[entry]}/
    end
  end

  # Builds an inline ' style="..."' attribute for +tag+ from random CSS
  # properties and values. Properties excluded for the current browser are
  # emitted with an 'EXCLUDED_' prefix. The ':' and ';' separators are
  # occasionally left out on purpose.
  def generateCssStyling(tag)
    style = ' style="'.dup
    property_limit = rand(@config['properties_per_style_max'])
    (0..property_limit).each do
      name = @cssTags[rand(@cssTags.length)]
      name = "EXCLUDED_#{name}" if isPropertyInBlacklist([tag, 'style', name])
      style << name

      # very small chance we let the tag run on.
      style << ": " if rand(65) > 1

      value_limit = rand(@config['attributes_per_style_property_max'])
      values = (0..value_limit).map { generateCssValue(name) }
      style << values.join(' ')

      # we almost always put the ; there.
      style << ";\n    " if rand(65) > 1
    end
    style << "\""
  end

  # Builds a mangled opening tag for +tag+ with random attributes and,
  # usually, an inline style.
  #
  # tag             - the HTML tag name.
  # no_close_chance - when false (the default) there is a 15% chance that a
  #                   bare closing tag ("</tag>") is returned instead.
  #
  # Attributes excluded for the current browser are emitted with an
  # 'EXCLUDED_' prefix.
  def mangleTag(tag, no_close_chance=false)
    if not no_close_chance and rand(100) < 15
      return "</" + tag + ">"
    end
    out = "<" + tag
    if rand(100) > 1
      out << ' '
    else
      out << generateOverflow()
    end

    attrNum = rand(@config['attributes_per_html_tag_max']) + 1
    attrs = []
    # The HTML head tag does not have many useful attributes, but is always included in tests.
    if tag == 'head' and rand(100) < 75
      case rand(3)
        when 0 then attrs << 'lang'
        when 1 then attrs << 'dir'
        when 2 then attrs << 'profile'
      end
    end
    # 75% of the time, these tags get a src attribute
    if $SRC_TAGS.include?(tag) and rand(100) < 75
      # Use the same exclusion check as every other attribute so that the
      # browser pattern and the '*' wildcard are honored for src as well.
      if isPropertyInBlacklist([tag, 'src'])
        attrs << 'EXCLUDED_src'
      else
        attrs << 'src'
      end
    end

    while attrs.length < attrNum
      attribute = @htmlAttr[rand(@htmlAttr.length)]
      if isPropertyInBlacklist([tag, attribute])
        attribute = "EXCLUDED_#{attribute}"
      end
      attrs << attribute
    end

    # Add a few HTML attributes
    attrs.each do |attr|
      out << attr
      # Occasionally omit the '=' between name and value.
      if rand(100) > 1
        out << '='
      end
      # Half of the values are quoted...
      quoted = rand(100) >= 50
      out << "\"" if quoted
      out << generateHtmlValue(attr)
      # ...and one in ten of those never gets its closing quote.
      if quoted and rand(100) >= 10
        out << "\""
      end
      if rand(100) >= 1
        out << "\n  "
      end
    end

    if rand(100) >= 25
      out << generateCssStyling(tag)
    end
    out << ">\n"
    return out
  end

  # Returns the test number the next redirect should use: the current one
  # while a subtest is in progress, a random one in random mode, otherwise
  # the current number plus one.
  def nextTestNum()
    return @test_num if @subtest_data
    return rand(99999999999) if @random_mode
    @test_num + 1
  end

  def generateCssPattern()
    # Generate a CSS selector pattern.
    choice = rand(100)
    pattern = ''
    case choice
      when 0..84 then pattern = @htmlTags[rand(@htmlTags.length)].dup
      when 85..89 then pattern = "*"
      when 90..94 then pattern = @cssAtRules[rand(@cssAtRules.length)].dup
      when 95..100 then pattern = ''
    end

    if rand(100) < 25
      pattern << " " + @htmlTags[rand(@htmlTags.length)]
    end

    if rand(100) < 25
      pattern << " > " + @htmlTags[rand(@htmlTags.length)]
    end

    if rand(100) < 25
      pattern << " + " + @htmlTags[rand(@htmlTags.length)]
    end

    if rand(100) < 10
      pattern << "*"
    end


    if rand(100) < 25
      pseudo = @cssPseudoTags[rand(@cssPseudoTags.length)].dup
      # These tags typically have a parenthesis
      if (pseudo =~ /^lang|^nth|^not/ and rand(100) < 75 and pseudo !~ /\(/) or rand(100) < 20
        pseudo << '('
      end

      if pseudo =~ /\(/
        if rand(100) < 75
          pseudo << generateGarbageValue()
        end
        if rand(100) < 75
          pseudo << ')'
        end
      end
      pattern << ":" + pseudo
    end

    if rand(100) < 20
      html_attr = @htmlAttr[rand(@htmlAttr.length)]
      match = '[' + html_attr
      choice = rand(100)
      garbage = generateGarbageValue()
      case choice
        when 0..25 then match << ']'
        when 26..50 then match << "=\"#{garbage}\"]"
        when 51..75 then match << "=~\"#{garbage}\"]"
        when 76..99 then match << "|=\"#{garbage}\"]"
      end
      pattern << match
    end

    if rand(100) < 20
      if rand(100) < 50
        pattern << '.' + generateGarbageValue()
      else
        pattern << '.*'
      end
    end

    if rand(100) < 20
      pattern << '#' + generateGarbageValue()
    end

    if rand(100) < 5
      pattern << ' #' + generateGarbageValue()
    end

    return pattern
  end

  # Builds the body of a <style> element: a random number of rules, each a
  # generated selector followed by random property/value declarations.
  # Braces and semicolons are occasionally left out on purpose. Properties
  # excluded for the current browser get an 'EXCLUDED_' prefix.
  def buildStyleTag()
    css = "\n".dup
    rule_limit = rand(@config['properties_per_style_max'])
    (0..rule_limit).each do
      css << generateCssPattern()
      css << " {\n" if rand(100) < 90

      declaration_limit = rand(@config['properties_per_style_max'])
      (0..declaration_limit).each do
        name = @cssTags[rand(@cssTags.length)].dup
        name = "  EXCLUDED_#{name}" if isPropertyInBlacklist(['style', 'style', name])
        css << "  #{name}: "

        value_limit = rand(@config['attributes_per_style_property_max'])
        css << (0..value_limit).map { generateCssValue(name) }.join(' ')
        css << ";\n" if rand(100) < 95
      end

      css << "\n}\n" if rand(100) < 90
    end
    css
  end


  # Build any malicious javascript here. Fairly naive at the moment.
  #
  # Returns an onload handler that creates a random element, assigns it a
  # generated style property and attribute, appends it to the body, sets a
  # second style property to a garbage number and writes two mangled tags.
  def buildJavaScript
    pick = lambda { |list| list[rand(list.length)] }

    # The order of these draws determines the output for a given seed.
    target = pick.call(@htmlTags)
    first_property = pick.call(@cssTags)
    second_property = pick.call(@cssTags)
    attribute = pick.call(@htmlAttr)
    style_value = generateCssValue(first_property)
    attribute_value = generateHtmlValue(attribute)
    number = generateGarbageNumber()
    first_tag = mangleTag(pick.call(@htmlTags))
    second_tag = mangleTag(pick.call(@htmlTags))

    [
      "window.onload=function(){",
      "  var ietarget = document.createElement('#{target}');",
      "  ietarget.style.#{first_property} = '#{style_value}';",
      "  ietarget.#{attribute} = '#{attribute_value}';",
      "  document.body.appendChild(ietarget);",
      "  ietarget.style.#{second_property} = #{number};",
      "  document.write('#{first_tag}');",
      "  document.write('#{second_tag}');",
      "}"
    ].join("\n")
  end

  # Returns a corrupted copy of the sample media registered for +mime_type+.
  #
  # When no sample exists, a notice is printed and garbage text is used as
  # the starting data. A run of garbage is spliced in at a random offset and
  # 15% of the time more garbage is appended. The stored sample itself is not
  # modified.
  def buildMediaFile(mime_type)
    have_sample = @media.has_key?(mime_type)
    puts "No media found for #{mime_type}" unless have_sample
    data = have_sample ? @media[mime_type].dup : generateGarbageText()

    # corrupt it in a subtle way
    garbage = rand(100) > 50 ? generateGarbageValue() : rand(255).chr * rand(8)

    # Strings only carry an encoding on Ruby 1.9 and later.
    if "1.9".respond_to?(:encoding)
      [garbage, data].each { |buffer| buffer.force_encoding('ASCII-8BIT') }
    end

    splice_at = rand(data.length)
    # NOTE(review): the range is inclusive, so garbage.length + 1 bytes are
    # replaced by garbage.length bytes - confirm that this is intended.
    data[splice_at..(splice_at + garbage.length)] = garbage
    data << generateGarbageValue() if rand(100) < 15
    data
  end

  # Parse the subtest data passed in as part of the URL
  def parseSubTestData(subtest_data)
    # Initialize with one line at 0
    if not subtest_data or subtest_data.to_i == 0
      return [@config['initial_subtest_width'], [0]]
    end
     (lines_at_time, offsets_string) = subtest_data.split('_')
    offsets = offsets_string.split(',').map! {|x| x.to_i }
    return [lines_at_time.to_i, offsets]
  end

  # Builds the CGI URL for a test page.
  #
  # test_num        - test number (t=).
  # subtest_width   - when given, subtest data is encoded as
  #                   s=<width>_<offsets>; if there are more offsets than
  #                   'subtest_combinations_max', a test_redirect lookup URL
  #                   is produced instead.
  # subtest_offsets - Array of offsets; required when subtest_width is given.
  # mime_type       - optional MIME type to request (m=).
  #
  # Random mode (r=1) or the stop number (x=) and the browser (b=) are
  # appended as well. Returns a new String.
  def generateTestUrl(test_num, subtest_width=nil, subtest_offsets=nil, mime_type=nil)
    query = ["t=" + test_num.to_s]

    if subtest_width
      if subtest_offsets.length > @config['subtest_combinations_max']
        query << "l=test_redirect" << "z=THE_END"
      else
        query << "s=" + subtest_width.to_s + "_" + subtest_offsets.join(',')
      end
    end

    if @random_mode
      query << "r=1"
    elsif @stop_num
      query << "x=" + @stop_num.to_s
    end

    query << 'm=' + CGI::escape(mime_type) if mime_type
    query << "b=" + CGI::escape(@browser)

    @cgi_url + '?' + query.join('&')
  end

  # Builds +tag_count+ mangled body tags, starting with <body>.
  #
  # Tags listed in the 'favor_html_tags' configuration randomly replace
  # already chosen tags (never the body tag). script and style tags get
  # generated content; other tags usually get garbage text. A third of the
  # tags are closed. Returns an Array of Strings, each wrapped in START/END
  # comments.
  def buildBodyTags(tag_count)
    tags = ['body']
    # subtract the <body> tag from tag_count.
    (1..tag_count - 1).each { tags << @htmlTags[rand(@htmlTags.length)] }

    # Lean ourselves toward lots of img and src tests
    @config['favor_html_tags'].each do |favored, percent|
      next unless rand(100) < percent.to_f
      # Don't overwrite the body tag.
      tags[rand(tags.length - 1) + 1] = favored
    end

    # Now we have our hitlist of tags,lets mangle them.
    tags.map do |tag|
      markup = mangleTag(tag)
      case tag
      when 'script'
        markup = "<script>" if rand(100) < 40
        markup << buildJavaScript() + "\n" + "</script>\n"
      when 'style'
        markup = "<style>" if rand(100) < 40
        markup << buildStyleTag() + "\n" + "</style>\n"
      else
        markup << generateGarbageText() if rand(100) <= 90
        markup << "\n"
      end

      markup << "</#{tag}>\n" if rand(100) <= 33
      "\n<!-- START #{tag} -->\n" + markup + "\n<!-- END #{tag} -->\n"
    end
  end

  # Builds the mangled header tags: html, head, +tag_count+ - 1 random valid
  # head tags and one random tag of any kind. None of them can come out as a
  # bare closing tag. Returns an Array of Strings.
  def buildHeaderTags(tag_count)
    valid_head_tags = ['title', 'base', 'link', 'meta']
    names = ['html', 'head']
    (1..tag_count - 1).each { names << valid_head_tags[rand(valid_head_tags.length)] }
    names << @htmlTags[rand(@htmlTags.length)]
    names.map { |name| mangleTag(name, true) }
  end

  # Returns the static page shown once the browser has survived both
  # redirect tests. +page_type+ is not used.
  def buildSurvivedPage(page_type)
    [
      "<html><head>",
      "<body>Bummer. You survived both redirects. Let me go sulk in the corner.</body>",
      "</html>"
    ].join
  end

  # Builds the HTML that redirects the browser to the next page.
  #
  # test_num     - current test number.
  # subtest_data - raw subtest String from the URL, or nil.
  # lookup_mode  - nil for a normal run; '1' to suppress redirects;
  #                'test_redirect' / 'test_another_redirect' advance through
  #                the redirect checks; any other value is passed along.
  # stop_num     - test number at which redirecting stops (optional).
  #
  # Returns '' when no redirect is wanted, otherwise a META refresh plus a
  # JavaScript fallback pointing at the next URL.
  def buildRedirect(test_num, subtest_data, lookup_mode, stop_num=nil)
    # no more redirects.
    return '' if lookup_mode == '1' || stop_num == test_num

    if subtest_data
      # Parse the data that was passed in (not @subtest_data) so that the
      # check above and the parsed value always refer to the same input.
      width, offsets = parseSubTestData(subtest_data)
    else
      width, offsets = nil
    end

    # We still need a redirect, but don't bother generating new data.
    if lookup_mode
      redirect_url = generateTestUrl(test_num, width, offsets)
      if lookup_mode == 'test_redirect'
        redirect_url << "&l=test_another_redirect"
      elsif lookup_mode == 'test_another_redirect'
        redirect_url << "&l=survived_redirect"
      else
        redirect_url << "&l=#{lookup_mode}"
      end
    else
      # This is a normal redirect going on to the next page. If we have subtest, get the next one.
      if subtest_data
        width, offsets = combine_combo_creator(@config['html_tags_per_page'], width, offsets)[0..1]
      end
      redirect_url = generateTestUrl(nextTestNum(), width, offsets)
    end

    redirect_code = "\t<META HTTP-EQUIV=\"Refresh\" content=\"0;URL=#{redirect_url}\">\n"
    # use both techniques, because you never know how you might be corrupting yourself.
    redirect_code << "\t<script language=\"javascript\">setTimeout('window.location=\"#{redirect_url}\"', 1000);</script>\n"
    return redirect_code
  end

  # Builds the complete HTML test page for the current test and subtest.
  #
  # Returns the "survived" page when @lookup_mode is 'survived_redirect'.
  # Otherwise header and body tags are generated (or taken from $TEST_CACHE),
  # reduced to the lines selected by the subtest data when there is any, and
  # assembled together with the redirect code, a title and info comments.
  def buildPage()
    if @lookup_mode == 'survived_redirect'
      return self.buildSurvivedPage(@lookup_mode)
    end
    tag_count = @config['html_tags_per_page']

    # Reuse the tags generated earlier for this test number, if any.
    if $TEST_CACHE.has_key?(@test_num)
     (header_tags, body_tags) = $TEST_CACHE[@test_num]
    else
      header_tags = buildHeaderTags(3)
      body_tags = buildBodyTags(tag_count - header_tags.length)
    end
    # Line numbers (indexes into header_tags + body_tags) that must always be
    # emitted, at least as a plain tag.
    required_tags = {
      0 => 'html',
      1 => 'head',
      header_tags.length => 'body'
    }

    if @subtest_data and @subtest_data.length > 0
      # Remember the full tag list so every subtest of this test number is
      # cut from the same data.
      if not $TEST_CACHE.has_key?(@test_num)
        $TEST_CACHE[@test_num] = [header_tags, body_tags]
      end
      (width, offsets) = parseSubTestData(@subtest_data)
      # combine_combo_creator is defined outside this file; presumably its
      # third element is the list of line numbers to include - TODO confirm.
      lines = combine_combo_creator(tag_count, width, offsets)[2]
      all_tags = header_tags + body_tags
      body_start = header_tags.length
      # Rebuild both lists from scratch; the cached arrays are not modified.
      header_tags = []
      body_tags = []
      # <html> and <body> are required, regardless of their existence in the subtest data.
      0.upto(tag_count) do |line_number|
        tag_data = nil
        if lines.include?(line_number)
          tag_data = all_tags[line_number]
        elsif required_tags.key?(line_number)
          # Not selected, but required: emit the bare, unmangled tag.
          tag_data = "<" + required_tags[line_number] + ">"
        end
        if tag_data
          if line_number < body_start
            header_tags << tag_data
          else
            body_tags << tag_data
          end
        end
      end
      header_tags << "<!-- subtest mode: #{offsets.length} combinations, width: #{width} -->"
    end

    # The first two header tags (html and head) come first, followed by the
    # redirect code and the title.
    htmlText = header_tags[0..1].join("\n\t")
    htmlText << buildRedirect(@test_num, @subtest_data, @lookup_mode, @stop_num)
    htmlText << "<title>[#{@test_num}:#{@subtest_data}] iExploder #{$VERSION} - #{generateGarbageText()}</title>\n"
    # Prefer the claimed browser when it is set and longer than one character.
    if @claimed_browser and @claimed_browser.length > 1
      show_browser = @claimed_browser
    else
      show_browser = @browser
    end
    htmlText << "\n<!-- iExploder #{$VERSION} | test #{@test_num}:#{@subtest_data} at #{Time.now} -->\n"
    htmlText << "<!-- browser: #{show_browser} -->\n"
    # Remaining header tags, then the body.
    htmlText << header_tags[2..-1].join("\n\t")
    htmlText << "\n</head>\n\n"
    htmlText << body_tags.join("\n")
    htmlText << "</body>\n</html>"
    return htmlText
  end

  # Builds the HTTP response headers for a generated document.
  #
  # Returns a Hash that starts with 'Content-Type' => +mime_type+ and adds a
  # random selection of known header names, each set to a garbage number
  # (about a quarter of the time) or a garbage URL.
  def buildHeaders(mime_type)
    banned_headers = []
    response = {'Content-Type' => mime_type}

    picked = []
    (0..rand(@config['headers_per_page_max'])).each do
      candidate = @headerValues[rand(@headerValues.length)]
      picked << candidate unless banned_headers.include?(candidate.downcase)
    end

    picked.uniq.each do |name|
      response[name] = rand(100) > 75 ? generateGarbageNumber() : generateGarbageUrl(name)
    end
    response
  end
end


# for testing
#
# Usage: <script> [test_num] [subtest_data] [mime_type]
# Prints the generated page, or, when a MIME type is given, the generated
# headers followed by a corrupted media file of that type.
if $0 == __FILE__
  ie = IExploder.new('config.yaml')
  # Default to test 1 when no test number is given. (ARGV[0].to_i || 1 could
  # never produce that default, because to_i never returns nil.)
  ie.test_num = (ARGV[0] || 1).to_i
  ie.subtest_data = ARGV[1]
  mime_type = ARGV[2]
  ie.setRandomSeed()
  if mime_type
    ie.buildHeaders(mime_type).each do |key, value|
      puts "#{key}: #{value}"
    end
    puts "Mime-Type: #{mime_type}"
    puts ie.buildMediaFile(mime_type)
  else
    puts ie.buildPage()
  end
end