// -*- C++ -*-

// Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation; either version 3, or (at your option) any later
// version.

// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/** @file parallel/partial_sum.h
 *  @brief Parallel implementation of std::partial_sum(), i. e. prefix
 *  sums.
 *  This file is a GNU parallel extension to the Standard C++ Library.
 */

// Written by Johannes Singler.

#ifndef _GLIBCXX_PARALLEL_PARTIAL_SUM_H
#define _GLIBCXX_PARALLEL_PARTIAL_SUM_H 1

#include <omp.h>
#include <new>
#include <bits/stl_algobase.h>
#include <parallel/parallel.h>
#include <parallel/numericfwd.h>

namespace __gnu_parallel
{
  // Problem: the neutral ("zero") element of the binary operation is
  // not known in general, so a start value has to be passed explicitly.

/** @brief Sequential prefix sum kernel used as the base case.
  *
  *  Walks the input once, folding each element into a running value
  *  with @c bin_op(running, element) and writing the running value to
  *  the output after every step.  The start value itself is not
  *  written.
  *
  *  @param begin Begin iterator of input sequence.
  *  @param end End iterator of input sequence.
  *  @param result Begin iterator of output sequence.
  *  @param bin_op Associative binary function.
  *  @param value Start value the first element is combined with.  It
  *  has to be supplied by the caller because the neutral element of
  *  @c bin_op is unknown in general.
  *  @return End iterator of output sequence; equal to @c result if the
  *  input is empty. */
template<typename InputIterator,
	 typename OutputIterator,
	 typename BinaryOperation>
  OutputIterator
  parallel_partial_sum_basecase(InputIterator begin, InputIterator end,
				OutputIterator result, BinaryOperation bin_op,
				typename std::iterator_traits
				<InputIterator>::value_type value)
  {
    // An empty range simply skips the loop and returns result unchanged.
    for (; begin != end; ++result, ++begin)
      {
        // The running value is always the left operand of bin_op.
        value = bin_op(value, *begin);
        *result = value;
      }

    return result;
  }

/** @brief Parallel partial sum implementation, two-phase approach,
    no recursion.
    *  @param begin Begin iterator of input sequence.
    *  @param end End iterator of input sequence.
    *  @param result Begin iterator of output sequence.
    *  @param bin_op Associative binary function.
    *  @param n Length of sequence.
    *  @param num_threads Number of threads to use.
    *  @return End iterator of output sequence.
    */
template<typename InputIterator,
	 typename OutputIterator,
	 typename BinaryOperation>
  OutputIterator
  parallel_partial_sum_linear(InputIterator begin, InputIterator end,
			      OutputIterator result, BinaryOperation bin_op,
			      typename std::iterator_traits
			      <InputIterator>::difference_type n)
  {
    typedef std::iterator_traits<InputIterator> traits_type;
    typedef typename traits_type::value_type value_type;
    typedef typename traits_type::difference_type difference_type;

    if (begin == end)
      return result;

    thread_index_t num_threads =
        std::min<difference_type>(get_max_threads(), n - 1);

    if (num_threads < 2)
      {
        *result = *begin;
        return parallel_partial_sum_basecase(
            begin + 1, end, result + 1, bin_op, *begin);
      }

    difference_type* borders;
    value_type* sums;

    const _Settings& __s = _Settings::get();

#   pragma omp parallel num_threads(num_threads)
      {
#       pragma omp single
          {
            num_threads = omp_get_num_threads();

            borders = new difference_type[num_threads + 2];

            if (__s.partial_sum_dilation == 1.0f)
              equally_split(n, num_threads + 1, borders);
            else
              {
                difference_type chunk_length =
                    ((double)n
		     / ((double)num_threads + __s.partial_sum_dilation)),
		  borderstart = n - num_threads * chunk_length;
                borders[0] = 0;
                for (int i = 1; i < (num_threads + 1); ++i)
                  {
                    borders[i] = borderstart;
                    borderstart += chunk_length;
                  }
                borders[num_threads + 1] = n;
              }

            sums = static_cast<value_type*>(::operator new(sizeof(value_type)
							   * num_threads));
            OutputIterator target_end;
          } //single

        thread_index_t iam = omp_get_thread_num();
        if (iam == 0)
          {
            *result = *begin;
            parallel_partial_sum_basecase(begin + 1, begin + borders[1],
					  result + 1, bin_op, *begin);
            ::new(&(sums[iam])) value_type(*(result + borders[1] - 1));
          }
        else
          {
            ::new(&(sums[iam]))
	      value_type(std::accumulate(begin + borders[iam] + 1,
					 begin + borders[iam + 1],
					 *(begin + borders[iam]),
					 bin_op,
					 __gnu_parallel::sequential_tag()));
          }

#       pragma omp barrier

#       pragma omp single
          parallel_partial_sum_basecase(
              sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);

#       pragma omp barrier

        // Still same team.
        parallel_partial_sum_basecase(begin + borders[iam + 1],
				      begin + borders[iam + 2],
				      result + borders[iam + 1], bin_op,
				      sums[iam]);
      } //parallel

    ::operator delete(sums);
    delete[] borders;

    return result + n;
  }

/** @brief Parallel partial sum front-end.
  *
  *  Computes the sequence length as @c end - @c begin and dispatches on
  *  the @c partial_sum_algorithm setting.  Only the @c LINEAR variant is
  *  handled; any other value triggers @c _GLIBCXX_PARALLEL_ASSERT(0).
  *
  *  @param begin Begin iterator of input sequence.
  *  @param end End iterator of input sequence.
  *  @param result Begin iterator of output sequence.
  *  @param bin_op Associative binary function.
  *  @return End iterator of output sequence. */
template<typename InputIterator,
	 typename OutputIterator,
	 typename BinaryOperation>
  OutputIterator
  parallel_partial_sum(InputIterator begin, InputIterator end,
                       OutputIterator result, BinaryOperation bin_op)
  {
    // Pass the (non-negative) sequence length, matching n below.
    // NOTE(review): _GLIBCXX_CALL is defined elsewhere; presumably a
    // tracing hook that reports the problem size -- confirm.
    _GLIBCXX_CALL(end - begin)

    typedef std::iterator_traits<InputIterator> traits_type;
    typedef typename traits_type::difference_type difference_type;

    difference_type n = end - begin;

    switch (_Settings::get().partial_sum_algorithm)
      {
      case LINEAR:
        // The linear variant takes its start value from the first
        // input element.
        return parallel_partial_sum_linear(begin, end, result, bin_op, n);
      default:
        // Partial_sum algorithm not implemented.
        _GLIBCXX_PARALLEL_ASSERT(0);
        return result + n;
      }
  }
}

#endif /* _GLIBCXX_PARALLEL_PARTIAL_SUM_H */