/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
char tdir[256];
char path[256];
long stats = 0;

void print_usage()
{
	printf("
This program creates files in a tree of random depth and branching. Files vary
in size randomly according to a distribution function which seems to model real
file systems.  This distribution function has a median size of median_file_size
(Median file size is hypothesized to be proportional to the average per file
space wastage. Notice how that implies that with a more efficient file system
file size usage patterns will in the long term move to a lower median file
size), and a maximum size of max_file_size.  Directories vary in size according
to the same distribution function but with separate parameters to control median
and maximum size for the number of files within them, and the number of
subdirectories within them.  This program prunes some empty subdirectories in a
way that causes parents of leaf directories to branch less than
median_dir_branching.

 To avoid having one large file distort the results such that you have
to benchmark many times set max_file_size to not more than
bytes_to_consume/10.  If maximum/median is a small integer, then
randomness is very poor.  This is a bug, Nikita, please find some
clever way to fix it.  If it is 0, then the program crashes....

For isolating performance consequences of design variations on
particular file or directory size ranges, try setting their median size and
max_size to both equal the max size of the file size range you want
to test.

To avoid having one large file distort the results set max_file_size to
not more than bytes_to_consume/10.  Using a distribution function for
the sizes of writes would be a natural next step in developing this program.\n\n");

	printf
	    ("Usage: reiser_fract_tree bytes_to_consume median_file_size max_file_size median_dir_nr_files max_directory_nr_files median_dir_branching max_dir_branching write_buffer_size /testfs_mount_point print_stats_flag\n\n");
}

/* #define DEBUG */

char *write_buffer;		/* buffer from which we write */
int write_buffer_size = 0;	/* gets reset to an argv  */
static int already_whined = 0;	/* keep out of disk space errors from being
				   endless by tracking whether we already
				   printed the message */
long bytes_to_consume = 0;	/* create files until their total number of
				   bytes exceeds this number, but not by more
				   than 1/10th */
long byte_total = 0;		/* bytes created so far */

/* statistics on sizes of files we attempted to create */
int fsz_0_100 = 0;
int fsz_100_1k = 0;
int fsz_1k_10k = 0;
int fsz_10k_100k = 0;
int fsz_100k_1m = 0;
int fsz_1m_10m = 0;
int fsz_10m_larger = 0;

void chngdir(char *name)
{
	int i;

	if (name[0] == '.' && name[1] == '.') {
		for (i = strlen(path); i > 0; i--) {
			if (path[i] == '/') {
				path[i] = 0;
				break;
			}
		}
	} else {
		strcat(path, "/");
		strcat(path, name);
	}
}

/* this is the core statistical distribution function, and it is used for file
   sizes, directory sizes, etc. */
int determine_size(double median_size,
		   double max_size /* The maximal value of size */ )
{
	/* when x is half of its random range (max_size/median_size), result is
	   median_size */
	int nr_random, granularity_reducer;
	double size, double_nr_random;

	/* it is a feature for us that this repeats identically every time it is run,
	   as otherwise meaningless variances would affect our results and require us
	   to use a higher number of benchmarks to achieve low noise results.  */
	nr_random = rand();
	median_size++;		/* avoids divide by zero errors */

	/* this code does poorly when max_size is not a lot more than median size,
	   and that needs fixing */

	/* THE NEXT 2 LINES ARE THE HEART OF THE PROGRAM */

	/* keep x below the value that when multiplied by median size on the next
	   line will equal max_size */
	/* the granularity_reducer is to handle the case where max_size is near
	   median_size, since '%' can only take ints, we need this complicated what
	   of handling that for small values of max_size/median_size by making
	   large ints out of small ints temporarily.  */
	if (max_size / median_size < 1024)
		granularity_reducer = 1024 * 1024;
	else
		granularity_reducer = 1;
	nr_random =
	    nr_random %
	    ((int)
	     (granularity_reducer *
	      (((double)max_size) / ((double)median_size))));
	double_nr_random = ((double)nr_random) / (granularity_reducer);
	size =
	    median_size * (1 /
			   (1 -
			    (double_nr_random) / (((double)max_size) /
						  ((double)median_size))) - 1);
	return ((int)size);
}

/* generate a unique filename */
void get_name_by_number(long this_files_number, char *str)
{
	sprintf(str, "%lu", this_files_number);
}

/* make a file of a specified size */
void make_file(int size)
{
	char string[128] = { 0 };
	char *str = string;
	char fname[256];
	int fd = 0;
	int error;
	static long this_files_number = 1;

	/* collect statistics about the size of files created, or more precisely, the
	   size of files that we will attempt to create. */
	if (size <= 100)
		fsz_0_100++;
	else if (size <= 1000)
		fsz_100_1k++;
	else if (size <= 10 * 1000)
		fsz_1k_10k++;
	else if (size <= 100 * 1000)
		fsz_10k_100k++;
	else if (size <= 1000 * 1000)
		fsz_100k_1m++;
	else if (size <= 10 * 1000 * 1000)
		fsz_1m_10m++;
	else
		fsz_10m_larger++;

	/* construct a name for the file */
	get_name_by_number(this_files_number++, str);
	strcpy(fname, path);
	strcat(fname, "/");
	strcat(fname, str);

	/* open the file, and deal with the various errors that can occur */

	if ((fd = open(fname, O_CREAT | O_EXCL | O_RDWR, 0777)) == -1) {
		if (errno == ENOSPC) {
			if (!already_whined) {
				printf
				    ("reiser-2021A: out of disk (or inodes) space, will keep trying\n");
				already_whined = 1;	/* we continue other file creation in out of
							   space conditions */
			}
			return;
		}
		/*  it is sometimes useful to be able to run this program more than once
		   inside the same directory, and that means skipping over filenames that
		   already exist.  Thus we ignore EEXIST, and pay attention to all
		   else. */
		if (errno == EEXIST) {	/* just skip existing file */
			return;
		}
		perror("open");
		exit(errno);
	}
	/* write to the file until it is the right size, handling the various error
	   conditions appropriately */

	while (size > 0) {
		size -= (error =
			 write(fd, write_buffer,
			       (size <
				write_buffer_size -
				1) ? size : (write_buffer_size - 1)));
		if (error == -1) {
			if (errno == ENOSPC) {
				if (!already_whined) {
					printf
					    ("reiser-2022: out of disk space, will keep trying\n");
					already_whined = 1;
				}
				close(fd);
				return;
			}
			perror("write() failed");
			exit(errno);
		}
	}

	/* close the file */
	if (close(fd)) {
		perror("close() failed");
		exit(errno);
	}
}

/* print the statistics on how many files were created of what size */

void print_stats()
{
	if (!stats)
		return;

	printf("\n");
	printf("File stats: Units are decimal (1k = 1000)\n");
	printf("files 0-100    : %i\n", fsz_0_100);
	printf("files 100-1K   : %i\n", fsz_100_1k);
	printf("files 1K-10K   : %i\n", fsz_1k_10k);
	printf("files 10K-100K : %i\n", fsz_10k_100k);
	printf("files 100K-1M  : %i\n", fsz_100k_1m);
	printf("files 1M-10M    : %i\n", fsz_1m_10m);
	printf("files 10M-larger    : %i\n", fsz_10m_larger);
	printf("total bytes written    : %lu\n", byte_total);

}

/* predict the number of files that will be created before max_bytes total
   length of files is reached */
long determine_nr_of_files(int median_file_size, double max_file_size,
			   long bytes_to_consume)
{
	long nr_of_files = 0, byte_total = 0;

	/* the next line is not necessary as 1 is the default, it is just cautious
	   coding */
	srand(1);
	while (byte_total < bytes_to_consume) {
		byte_total += determine_size(median_file_size, max_file_size);
		nr_of_files++;
	}
	/* reset the random number generator so that when we determine_size() of the
	   files later they will be created with the same "random" sequence used in
	   this calculation */
	srand(1);
#ifdef DEBUG
	printf("number of files is %d\n", (int)nr_of_files);
#endif /* DEBUG */
	fflush(NULL);
	return nr_of_files;
}

/* fill the current working directory with nr_files_this_directory number of
   files*/

void fill_this_directory(long nr_files_this_directory, long median_file_size,
			 long maximum_size)
{
	long size;

#ifdef DEBUG
	printf("filling with %lu files, ", nr_files_this_directory);
#endif
	while (nr_files_this_directory--) {
		size = determine_size(median_file_size, maximum_size);
		byte_total += size;
		make_file(size);
	}
}

/* this will unfortunately handle out of disk space by forever trying */
/* What we should do in out of space situaltion ? I think we must skip this
   directory and continue files/dirs creation process. Error value (!= 0)
   indicates that we can't go to this directory. -zam */
int make_directory(char *dirname)
{
	static long this_directory_number = 0;

	strcpy(tdir, path);
	strcat(tdir, "/");
	strcat(tdir, dirname);

	if (mkdir(tdir, 0755) == -1) {
		if (errno == ENOSPC) {
			if (!already_whined) {
				printf("reiser-2021: out of disk space, ");
				already_whined = 1;
			}
			return errno;
		}
		/*  it is sometimes useful to be able to run this program more than once
		   inside the same directory, and that means skipping over filenames that
		   already exist.  Thus we ignore EEXIST, and pay attention to all else. */
		if (errno != EEXIST) {
			perror("mkdir");
			exit(errno);
		}
	}
	sprintf(dirname, "d%lu", this_directory_number++);
	strcpy(tdir, path);
	strcat(tdir, "/");
	strcat(tdir, dirname);

	return 0;
}

/* assumes we are already chdir'd into a directory that the subtree is rooted
   at.  Fills the directory with files and subdirectories, cd's into those
   subdirectories, and recurses upon itself */

void do_subtree(
		       /* the start and end of the portion of the directory sizes
		          array which corresponds to the sizes of the directories
		          composing this subtree */
		       /* sizes_end minus sizes_start is equal to the number of
		          directories in this subtree */
		       long *sizes_start, long *sizes_end,
		       long median_file_size, long maximum_file_size,
		       long median_dir_branching, long max_dir_branching)
{
	long *p;
	long *sub_start;
	long *sub_end;
	int index_subdirectory_to_add_directory_to;
	long *dirs_in_subtrees;
	char *subtree_name;
	long *sizes_index = sizes_start;
	char subtree_name_array[128];
	long this_directory_branching;
	static long this_directorys_number;

	subtree_name = subtree_name_array;
	/* fill this directory with its number of files */
	fill_this_directory(*sizes_index, median_file_size, maximum_file_size);
	sizes_index++;
	/* ok, now randomly assign directories (and their number of files) among the
	   subdirectories that will be created if at least one directory is assigned
	   to it */

	/* this will cause the random number sequence to not match the one used in
	   determine_nr_files() I need to accumulate my values in an array
	   beforehand. I'll code that later.  */
	/* worry about whether 0 or 1 is a problem value */
	this_directory_branching =
	    determine_size(median_dir_branching, max_dir_branching) + 1;

	/* create an array holding the number of directories assigned to each
	   potential subdirectory */
	dirs_in_subtrees = calloc(this_directory_branching, sizeof(long));
	while (sizes_index <= sizes_end) {
		index_subdirectory_to_add_directory_to =
		    (rand() % this_directory_branching);
		(*
		 (dirs_in_subtrees + index_subdirectory_to_add_directory_to))++;
		sizes_index++;
	}
	/* the +1 is for the fill_directory() we did above */
	sizes_index = sizes_start + 1;

	/* go through each potential subdirectory, and if at least one directory has
	   been assigned to it, create it and recurse */
	for (p = dirs_in_subtrees;
	     p < (dirs_in_subtrees + this_directory_branching); p++) {
		if (*p) {
			int nocd;
			sprintf(subtree_name, "d%lu", this_directorys_number++);
			nocd = make_directory(subtree_name);
			/* if make_dir.. may fails (in out of space situation), we continue
			   creation process in same dir */
			if (!nocd)
				chngdir(subtree_name);
			sub_start = sizes_index;
			/* the minus one is because *p is the number of elements and arrays start at 0 */
			sub_end = (sizes_index + (*p - 1));

#ifdef DEBUG
			/* comment this back in if the array logic has you going cross-eyed */
			/*      printf ("sizes_start is %p, sizes_index is %p, sizes_index+p is %p, sizes_end is %p\n", sizes_start, sub_start, sub_end, sizes_end); */
#endif
			do_subtree(sub_start, sub_end, median_file_size,
				   maximum_file_size, median_dir_branching,
				   max_dir_branching);
			if (!nocd)
				chngdir("..");
		}
		sizes_index += *p;
	}
}

/* We have already determined that nr_files can fit in bytes_to_consume space.
   Fill the sizes array with the number of files to be in each directory, and
   then call do_subtree to fill the tree with files and directories.  */

void make_fractal_tree(long median_file_size, long maximum_file_size,
		       long median_dir_nr_files, long max_dir_nr_files,
		       long median_dir_branching, long max_dir_branching,
		       long nr_files)
{
	long *sizes_start;
	long *sizes_end;
	long *sizes_index;
	long remaining_files = nr_files;

	/* collect together array of directory sizes for whole filesystem.  This
	   cannot easily be done recursively without distorting the directory sizes
	   and making deeper directories smaller.  Send me the code if you
	   disagree.:-) */
	/* we almost certainly don't need this much space, but so what.... */
	sizes_index = sizes_start = malloc(nr_files * sizeof(long));
	for (; remaining_files > 0;) {
		*sizes_index =
		    determine_size(median_dir_nr_files, max_dir_nr_files);
		// we alloc space for nr_files, so we should avoid
		// number of files in directory = 0 -grev.
		if (*sizes_index == 0)
			*sizes_index = 1;
		*sizes_index =
		    (*sizes_index <
		     remaining_files) ? *sizes_index : remaining_files;

#ifdef DEBUG
		printf("*sizes_index == %lu, ", *sizes_index);
#endif
		remaining_files -= *sizes_index;
		sizes_index++;
	}
	/* don't decrement below sizes_start if nr_files is 0 */
	sizes_end = (sizes_index-- > sizes_start) ? sizes_index : sizes_start;

	sizes_index = sizes_start;
	srand(1);
	do_subtree(sizes_start, sizes_end, median_file_size, maximum_file_size,
		   median_dir_branching, max_dir_branching);

}

int main(int argc, char *argv[])
{
	/* initialized from argv[] */
	long median_file_size,
	    median_dir_branching,
	    median_dir_nr_files,
	    max_dir_nr_files, max_dir_branching, max_file_size;
	long nr_of_files = 0;	/* files to be created */

	if (argc != 11) {
		print_usage();
		exit(1);
	}

	write_buffer_size = atoi(argv[8]);
	write_buffer = malloc(write_buffer_size);
	memset(write_buffer, 'a', write_buffer_size);

	/* the number of bytes that we desire this tree to consume.  It will actually
	   consume more, because the last file will overshoot by a random amount, and
	   because the directories and metadata will consume space.  */
	bytes_to_consume = atol(argv[1]);
	max_file_size = atol(argv[3]);
	median_file_size = atol(argv[2]);
	/* Figure out how many random files will fit into bytes_to_consume bytes. We
	   depend on resetting rand() to get the same result later. */
	nr_of_files =
	    determine_nr_of_files(median_file_size, max_file_size,
				  bytes_to_consume);

	strcpy(path, argv[9]);
	mkdir(path, 0755);
	stats = atol(argv[10]);
	median_dir_branching = atol(argv[6]);
	max_dir_branching = atol(argv[7]);
	median_dir_nr_files = atol(argv[4]);
	max_dir_nr_files = atol(argv[5]);
	make_fractal_tree(median_file_size, max_file_size, median_dir_nr_files,
			  max_dir_nr_files, median_dir_branching,
			  max_dir_branching, nr_of_files);
	print_stats();
	if (stats)
		printf("\nreiser_fract_tree finished\n");

	return 0;
}