/**************************************************************************
 * 
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 **************************************************************************/

/**
 * \file
 * Build post-transformation, post-clipping vertex buffers and element
 * lists by hooking into the end of the primitive pipeline and
 * manipulating the vertex_id field in the vertex headers.
 *
 * XXX: work in progress 
 * 
 * \author José Fonseca <jfonseca@vmware.com>
 * \author Keith Whitwell <keithw@vmware.com>
 */


#include "draw/draw_context.h"
#include "draw/draw_vbuf.h"
#include "util/u_debug.h"
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_fifo.h"

#include "i915_context.h"
#include "i915_reg.h"
#include "i915_batch.h"
#include "i915_state.h"


#define VBUF_MAP_BUFFER

/**
 * Primitive renderer for i915.
 */
struct i915_vbuf_render {
   struct vbuf_render base;

   struct i915_context *i915;

   /** Vertex size in bytes */
   size_t vertex_size;

   /** Software primitive */
   unsigned prim;

   /** Hardware primitive */
   unsigned hwprim;

   /** Genereate a vertex list */
   unsigned fallback;

   /* Stuff for the vbo */
   struct i915_winsys_buffer *vbo;
   size_t vbo_size; /**< current size of allocated buffer */
   size_t vbo_alloc_size; /**< minimum buffer size to allocate */
   size_t vbo_hw_offset; /**< offset that we program the hardware with */
   size_t vbo_sw_offset; /**< offset that we work with */
   size_t vbo_index; /**< index offset to be added to all indices */
   void *vbo_ptr;
   size_t vbo_max_used;
   size_t vbo_max_index; /**< index offset to be added to all indices */

#ifndef VBUF_MAP_BUFFER
   size_t map_used_start;
   size_t map_used_end;
   size_t map_size;
#endif
};


/**
 * Basically a cast wrapper.
 */
static inline struct i915_vbuf_render *
i915_vbuf_render(struct vbuf_render *render)
{
   assert(render);
   return (struct i915_vbuf_render *)render;
}

/**
 * If vbo state differs between renderer and context
 * push state to the context. This function pushes
 * hw_offset to i915->vbo_offset and vbo to i915->vbo.
 *
 * Side effects:
 *    May updates context vbo_offset and vbo fields.
 */
static void
i915_vbuf_update_vbo_state(struct vbuf_render *render)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;

   if (i915->vbo != i915_render->vbo ||
       i915->vbo_offset != i915_render->vbo_hw_offset) {
      i915->vbo = i915_render->vbo;
      i915->vbo_offset = i915_render->vbo_hw_offset;
      i915->dirty |= I915_NEW_VBO;
   }
}

/**
 * Callback exported to the draw module.
 * Returns the current vertex_info.
 *
 * Side effects:
 *    If state is dirty update derived state.
 */
static const struct vertex_info *
i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;

   if (i915->dirty) {
      /* make sure we have up to date vertex layout */
      i915_update_derived(i915);
   }

   return &i915->current.vertex_info;
}

/**
 * Reserve space in the vbo for vertices.
 *
 * Side effects:
 *    None.
 */
static boolean
i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
{
   struct i915_context *i915 = i915_render->i915;

   if (i915_render->vbo_size < size + i915_render->vbo_sw_offset)
      return FALSE;

   if (i915->vbo_flushed)
      return FALSE;

   return TRUE;
}

/**
 * Allocate a new vbo buffer should there not be enough space for
 * the requested number of vertices by the draw module.
 *
 * Side effects:
 *    Updates hw_offset, sw_offset, index and allocates a new buffer.
 *    Will set i915->vbo to null on buffer allocation.
 */
static void
i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
{
   struct i915_context *i915 = i915_render->i915;
   struct i915_winsys *iws = i915->iws;

   if (i915_render->vbo) {
      iws->buffer_unmap(iws, i915_render->vbo);
      iws->buffer_destroy(iws, i915_render->vbo);
      /*
       * XXX If buffers where referenced then this should be done in
       * update_vbo_state but since they arn't and malloc likes to reuse
       * memory we need to set it to null
       */
      i915->vbo = NULL;
      i915_render->vbo = NULL;
   }

   i915->vbo_flushed = 0;

   i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
   i915_render->vbo_hw_offset = 0;
   i915_render->vbo_sw_offset = 0;
   i915_render->vbo_index = 0;

#ifndef VBUF_MAP_BUFFER
   if (i915_render->vbo_size > i915_render->map_size) {
      i915_render->map_size = i915_render->vbo_size;
      FREE(i915_render->vbo_ptr);
      i915_render->vbo_ptr = MALLOC(i915_render->map_size);
   }
#endif

   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size,
                                         I915_NEW_VERTEX);
   i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
}

/**
 * Callback exported to the draw module.
 *
 * Side effects:
 *    Updates hw_offset, sw_offset, index and may allocate
 *    a new buffer. Also updates may update the vbo state
 *    on the i915 context.
 */
static boolean
i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
                                   ushort vertex_size,
                                   ushort nr_vertices)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   size_t size = (size_t)vertex_size * (size_t)nr_vertices;
   size_t offset;

   /*
    * Align sw_offset with first multiple of vertex size from hw_offset.
    * Set index to be the multiples from from hw_offset to sw_offset.
    * i915_vbuf_render_new_buf will reset index, sw_offset, hw_offset
    * when it allocates a new buffer this is correct.
    */
   {
      offset = i915_render->vbo_sw_offset - i915_render->vbo_hw_offset;
      offset = util_align_npot(offset, vertex_size);
      i915_render->vbo_sw_offset = i915_render->vbo_hw_offset + offset;
      i915_render->vbo_index = offset / vertex_size;
   }

   if (!i915_vbuf_render_reserve(i915_render, size))
      i915_vbuf_render_new_buf(i915_render, size);

   /*
    * If a new buffer has been alocated sw_offset,
    * hw_offset & index will be reset by new_buf
    */

   i915_render->vertex_size = vertex_size;

   i915_vbuf_update_vbo_state(render);

   if (!i915_render->vbo)
      return FALSE;
   return TRUE;
}

static void *
i915_vbuf_render_map_vertices(struct vbuf_render *render)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;

   if (i915->vbo_flushed)
      debug_printf("%s bad vbo flush occurred stalling on hw\n", __FUNCTION__);

#ifdef VBUF_MAP_BUFFER
   return (unsigned char *)i915_render->vbo_ptr + i915_render->vbo_sw_offset;
#else
   return (unsigned char *)i915_render->vbo_ptr;
#endif
}

static void
i915_vbuf_render_unmap_vertices(struct vbuf_render *render,
                                ushort min_index,
                                ushort max_index)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;
   struct i915_winsys *iws = i915->iws;

   i915_render->vbo_max_index = max_index;
   i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used, i915_render->vertex_size * (max_index + 1));
#ifdef VBUF_MAP_BUFFER
   (void)iws;
#else
   i915_render->map_used_start = i915_render->vertex_size * min_index;
   i915_render->map_used_end = i915_render->vertex_size * (max_index + 1);
   iws->buffer_write(iws, i915_render->vbo,
                     i915_render->map_used_start + i915_render->vbo_sw_offset,
                     i915_render->map_used_end - i915_render->map_used_start,
                     (unsigned char *)i915_render->vbo_ptr + i915_render->map_used_start);

#endif
}

/**
 * Ensure that the given max_index given is not larger ushort max.
 * If it is larger then ushort max it advanced the hw_offset to the
 * same position in the vbo as sw_offset and set index to zero.
 *
 * Side effects:
 *    On failure update hw_offset and index.
 */
static void
i915_vbuf_ensure_index_bounds(struct vbuf_render *render,
                              unsigned max_index)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);

   if (max_index + i915_render->vbo_index < ((1 << 17) - 1))
      return;

   i915_render->vbo_hw_offset = i915_render->vbo_sw_offset;
   i915_render->vbo_index = 0;

   i915_vbuf_update_vbo_state(render);
}

static void
i915_vbuf_render_set_primitive(struct vbuf_render *render, 
                               enum pipe_prim_type prim)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   i915_render->prim = prim;

   switch(prim) {
   case PIPE_PRIM_POINTS:
      i915_render->hwprim = PRIM3D_POINTLIST;
      i915_render->fallback = 0;
      break;
   case PIPE_PRIM_LINES:
      i915_render->hwprim = PRIM3D_LINELIST;
      i915_render->fallback = 0;
      break;
   case PIPE_PRIM_LINE_LOOP:
      i915_render->hwprim = PRIM3D_LINELIST;
      i915_render->fallback = PIPE_PRIM_LINE_LOOP;
      break;
   case PIPE_PRIM_LINE_STRIP:
      i915_render->hwprim = PRIM3D_LINESTRIP;
      i915_render->fallback = 0;
      break;
   case PIPE_PRIM_TRIANGLES:
      i915_render->hwprim = PRIM3D_TRILIST;
      i915_render->fallback = 0;
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      i915_render->hwprim = PRIM3D_TRISTRIP;
      i915_render->fallback = 0;
      break;
   case PIPE_PRIM_TRIANGLE_FAN:
      i915_render->hwprim = PRIM3D_TRIFAN;
      i915_render->fallback = 0;
      break;
   case PIPE_PRIM_QUADS:
      i915_render->hwprim = PRIM3D_TRILIST;
      i915_render->fallback = PIPE_PRIM_QUADS;
      break;
   case PIPE_PRIM_QUAD_STRIP:
      i915_render->hwprim = PRIM3D_TRILIST;
      i915_render->fallback = PIPE_PRIM_QUAD_STRIP;
      break;
   case PIPE_PRIM_POLYGON:
      i915_render->hwprim = PRIM3D_POLY;
      i915_render->fallback = 0;
      break;
   default:
      /* FIXME: Actually, can handle a lot more just fine... */
      assert(0 && "unexpected prim in i915_vbuf_render_set_primitive()");
   }
}

/**
 * Used for fallbacks in draw_arrays
 */
static void
draw_arrays_generate_indices(struct vbuf_render *render,
                             unsigned start, uint nr,
                             unsigned type)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;
   unsigned i;
   unsigned end = start + nr + i915_render->vbo_index;
   start += i915_render->vbo_index;

   switch(type) {
   case 0:
      for (i = start; i+1 < end; i += 2)
         OUT_BATCH((i+0) | (i+1) << 16);
      if (i < end)
         OUT_BATCH(i);
      break;
   case PIPE_PRIM_LINE_LOOP:
      if (nr >= 2) {
         for (i = start + 1; i < end; i++)
            OUT_BATCH((i-1) | (i+0) << 16);
         OUT_BATCH((i-1) | ( start) << 16);
      }
      break;
   case PIPE_PRIM_QUADS:
      for (i = start; i + 3 < end; i += 4) {
         OUT_BATCH((i+0) | (i+1) << 16);
         OUT_BATCH((i+3) | (i+1) << 16);
         OUT_BATCH((i+2) | (i+3) << 16);
      }
      break;
   case PIPE_PRIM_QUAD_STRIP:
      for (i = start; i + 3 < end; i += 2) {
         OUT_BATCH((i+0) | (i+1) << 16);
         OUT_BATCH((i+3) | (i+2) << 16);
         OUT_BATCH((i+0) | (i+3) << 16);
      }
      break;
   default:
      assert(0);
   }
}

static unsigned
draw_arrays_calc_nr_indices(uint nr, unsigned type)
{
   switch (type) {
   case 0:
      return nr;
   case PIPE_PRIM_LINE_LOOP:
      if (nr >= 2)
         return nr * 2;
      else
         return 0;
   case PIPE_PRIM_QUADS:
      return (nr / 4) * 6;
   case PIPE_PRIM_QUAD_STRIP:
      return ((nr - 2) / 2) * 6;
   default:
      assert(0);
      return 0;
   }
}

static void
draw_arrays_fallback(struct vbuf_render *render,
                     unsigned start,
                     uint nr)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;
   unsigned nr_indices;

   nr_indices = draw_arrays_calc_nr_indices(nr, i915_render->fallback);
   if (!nr_indices)
      return;

   i915_vbuf_ensure_index_bounds(render, start + nr_indices);

   if (i915->dirty)
      i915_update_derived(i915);

   if (i915->hardware_dirty)
      i915_emit_hardware_state(i915);

   if (!BEGIN_BATCH(1 + (nr_indices + 1)/2)) {
      FLUSH_BATCH(NULL, I915_FLUSH_ASYNC);

      /* Make sure state is re-emitted after a flush:
       */
      i915_emit_hardware_state(i915);
      i915->vbo_flushed = 1;

      if (!BEGIN_BATCH(1 + (nr_indices + 1)/2)) {
         assert(0);
         goto out;
      }
   }

   OUT_BATCH(_3DPRIMITIVE |
             PRIM_INDIRECT |
             i915_render->hwprim |
             PRIM_INDIRECT_ELTS |
             nr_indices);

   draw_arrays_generate_indices(render, start, nr, i915_render->fallback);

out:
   return;
}

static void
i915_vbuf_render_draw_arrays(struct vbuf_render *render,
                             unsigned start,
                             uint nr)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;

   if (i915_render->fallback) {
      draw_arrays_fallback(render, start, nr);
      return;
   }

   i915_vbuf_ensure_index_bounds(render, start + nr);
   start += i915_render->vbo_index;

   if (i915->dirty)
      i915_update_derived(i915);

   if (i915->hardware_dirty)
      i915_emit_hardware_state(i915);

   if (!BEGIN_BATCH(2)) {
      FLUSH_BATCH(NULL, I915_FLUSH_ASYNC);

      /* Make sure state is re-emitted after a flush:
       */
      i915_emit_hardware_state(i915);
      i915->vbo_flushed = 1;

      if (!BEGIN_BATCH(2)) {
         assert(0);
         goto out;
      }
   }

   OUT_BATCH(_3DPRIMITIVE |
             PRIM_INDIRECT |
             PRIM_INDIRECT_SEQUENTIAL |
             i915_render->hwprim |
             nr);
   OUT_BATCH(start); /* Beginning vertex index */

out:
   return;
}

/**
 * Used for normal and fallback emitting of indices
 * If type is zero normal operation assumed.
 */
static void
draw_generate_indices(struct vbuf_render *render,
                      const ushort *indices,
                      uint nr_indices,
                      unsigned type)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;
   unsigned i;
   unsigned o = i915_render->vbo_index;

   switch(type) {
   case 0:
      for (i = 0; i + 1 < nr_indices; i += 2) {
         OUT_BATCH((o+indices[i]) | (o+indices[i+1]) << 16);
      }
      if (i < nr_indices) {
         OUT_BATCH((o+indices[i]));
      }
      break;
   case PIPE_PRIM_LINE_LOOP:
      if (nr_indices >= 2) {
         for (i = 1; i < nr_indices; i++)
            OUT_BATCH((o+indices[i-1]) | (o+indices[i]) << 16);
         OUT_BATCH((o+indices[i-1]) | (o+indices[0]) << 16);
      }
      break;
   case PIPE_PRIM_QUADS:
      for (i = 0; i + 3 < nr_indices; i += 4) {
         OUT_BATCH((o+indices[i+0]) | (o+indices[i+1]) << 16);
         OUT_BATCH((o+indices[i+3]) | (o+indices[i+1]) << 16);
         OUT_BATCH((o+indices[i+2]) | (o+indices[i+3]) << 16);
      }
      break;
   case PIPE_PRIM_QUAD_STRIP:
      for (i = 0; i + 3 < nr_indices; i += 2) {
         OUT_BATCH((o+indices[i+0]) | (o+indices[i+1]) << 16);
         OUT_BATCH((o+indices[i+3]) | (o+indices[i+2]) << 16);
         OUT_BATCH((o+indices[i+0]) | (o+indices[i+3]) << 16);
      }
      break;
   default:
      assert(0);
      break;
   }
}

static unsigned
draw_calc_nr_indices(uint nr_indices, unsigned type)
{
   switch (type) {
   case 0:
      return nr_indices;
   case PIPE_PRIM_LINE_LOOP:
      if (nr_indices >= 2)
         return nr_indices * 2;
      else
         return 0;
   case PIPE_PRIM_QUADS:
      return (nr_indices / 4) * 6;
   case PIPE_PRIM_QUAD_STRIP:
      return ((nr_indices - 2) / 2) * 6;
   default:
      assert(0);
      return 0;
   }
}

static void 
i915_vbuf_render_draw_elements(struct vbuf_render *render,
                               const ushort *indices,
                               uint nr_indices)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;
   unsigned save_nr_indices;

   save_nr_indices = nr_indices;

   nr_indices = draw_calc_nr_indices(nr_indices, i915_render->fallback);
   if (!nr_indices)
      return;

   i915_vbuf_ensure_index_bounds(render, i915_render->vbo_max_index);

   if (i915->dirty)
      i915_update_derived(i915);

   if (i915->hardware_dirty)
      i915_emit_hardware_state(i915);

   if (!BEGIN_BATCH(1 + (nr_indices + 1)/2)) {
      FLUSH_BATCH(NULL, I915_FLUSH_ASYNC);

      /* Make sure state is re-emitted after a flush: 
       */
      i915_emit_hardware_state(i915);
      i915->vbo_flushed = 1;

      if (!BEGIN_BATCH(1 + (nr_indices + 1)/2)) {
         assert(0);
         goto out;
      }
   }

   OUT_BATCH(_3DPRIMITIVE |
             PRIM_INDIRECT |
             i915_render->hwprim |
             PRIM_INDIRECT_ELTS |
             nr_indices);
   draw_generate_indices(render,
                         indices,
                         save_nr_indices,
                         i915_render->fallback);

out:
   return;
}

static void
i915_vbuf_render_release_vertices(struct vbuf_render *render)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);

   i915_render->vbo_sw_offset += i915_render->vbo_max_used;
   i915_render->vbo_max_used = 0;

   /*
    * Micro optimization, by calling update here we the offset change
    * will be picked up on the next pipe_context::draw_*.
    */
   i915_vbuf_update_vbo_state(render);
}

static void
i915_vbuf_render_destroy(struct vbuf_render *render)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
   struct i915_context *i915 = i915_render->i915;
   struct i915_winsys *iws = i915->iws;

   if (i915_render->vbo) {
      i915->vbo = NULL;
      iws->buffer_unmap(iws, i915_render->vbo);
      iws->buffer_destroy(iws, i915_render->vbo);
   }

   FREE(i915_render);
}

/**
 * Create a new primitive render.
 */
static struct vbuf_render *
i915_vbuf_render_create(struct i915_context *i915)
{
   struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
   struct i915_winsys *iws = i915->iws;
   int i;

   i915_render->i915 = i915;

   i915_render->base.max_vertex_buffer_bytes = 4*4096;

   /* NOTE: it must be such that state and vertices indices fit in a single 
    * batch buffer. 4096 is one batch buffer and 430 is the max amount of 
    * state in dwords. The result is the number of 16-bit indices which can
    * fit in a single batch buffer.
    */
   i915_render->base.max_indices = (4096 - 430 * 4) / 2;

   i915_render->base.get_vertex_info = i915_vbuf_render_get_vertex_info;
   i915_render->base.allocate_vertices = i915_vbuf_render_allocate_vertices;
   i915_render->base.map_vertices = i915_vbuf_render_map_vertices;
   i915_render->base.unmap_vertices = i915_vbuf_render_unmap_vertices;
   i915_render->base.set_primitive = i915_vbuf_render_set_primitive;
   i915_render->base.draw_elements = i915_vbuf_render_draw_elements;
   i915_render->base.draw_arrays = i915_vbuf_render_draw_arrays;
   i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
   i915_render->base.destroy = i915_vbuf_render_destroy;

#ifndef VBUF_MAP_BUFFER
   i915_render->map_size = 0;
   i915_render->map_used_start = 0;
   i915_render->map_used_end = 0;
#endif

   i915_render->vbo = NULL;
   i915_render->vbo_ptr = NULL;
   i915_render->vbo_size = 0;
   i915_render->vbo_hw_offset = 0;
   i915_render->vbo_sw_offset = 0;
   i915_render->vbo_alloc_size = i915_render->base.max_vertex_buffer_bytes * 4;

#ifdef VBUF_USE_POOL
   i915_render->pool_used = FALSE;
   i915_render->pool_buffer_size = i915_render->vbo_alloc_size;
   i915_render->pool_fifo = u_fifo_create(6);
   for (i = 0; i < 6; i++)
      u_fifo_add(i915_render->pool_fifo,
                 iws->buffer_create(iws, i915_render->pool_buffer_size,
                                    I915_NEW_VERTEX));
#else
   (void)i;
   (void)iws;
#endif

   return &i915_render->base;
}

/**
 * Create a new primitive vbuf/render stage.
 */
struct draw_stage *i915_draw_vbuf_stage(struct i915_context *i915)
{
   struct vbuf_render *render;
   struct draw_stage *stage;
   
   render = i915_vbuf_render_create(i915);
   if (!render)
      return NULL;
   
   stage = draw_vbuf_stage(i915->draw, render);
   if (!stage) {
      render->destroy(render);
      return NULL;
   }
   /** TODO JB: this shouldn't be here */
   draw_set_render(i915->draw, render);

   return stage;
}