C++程序  |  335行  |  10.51 KB

/******************************************************************************
**	Filename:	cntraining.cpp
**	Purpose:	Generates a normproto and pffmtable.
**	Author:		Dan Johnson
**	Revisment:	Christy Russon
**	History:     Fri Aug 18 08:53:50 1989, DSJ, Created.
**		     5/25/90, DSJ, Adapted to multiple feature types.
**				Tuesday, May 17, 1998 Changes made to make feature specific and
**				simplify structures. First step in simplifying training process.
**
 **	(c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
******************************************************************************/


/**----------------------------------------------------------------------------
					Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "general.h"
#include "clusttool.h"
#include "cluster.h"
#include "name2char.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

#define PROGRAM_FEATURE_TYPE "cn"
#define MINSD (1.0f / 64.0f)

int	row_number;						/* cjn: fixes link problem */

/**----------------------------------------------------------------------------
					Public Function Prototypes
----------------------------------------------------------------------------**/
int main (
     int	argc,
     char	**argv);

/**----------------------------------------------------------------------------
					Private Function Prototypes
----------------------------------------------------------------------------**/
void ReadTrainingSamples (
     FILE	*File,
	 LIST* TrainingSamples);

void WriteNormProtos (
     char	*Directory,
     LIST	LabeledProtoList,
	 CLUSTERER *Clusterer);

/*
PARAMDESC *ConvertToPARAMDESC(
	PARAM_DESC* Param_Desc,
	int N);
*/

void WriteProtos(
     FILE	*File,
     uinT16	N,
     LIST	ProtoList,
     BOOL8	WriteSigProtos,
     BOOL8	WriteInsigProtos);

/**----------------------------------------------------------------------------
		  		Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/* global variable to hold configuration parameters to control clustering */
//-M 0.025   -B 0.05   -I 0.8   -C 1e-3
CLUSTERCONFIG	Config =
{
  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
};


/**----------------------------------------------------------------------------
							Public Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
int main (
     int	argc,
     char	**argv)

/*
**	Parameters:
**		argc	number of command line arguments
**		argv	array of command line arguments
**	Globals: none
**	Operation:
**		This program reads in a text file consisting of feature
**		samples from a training page in the following format:
**
**			FontName CharName NumberOfFeatureTypes(N)
**			   FeatureTypeName1 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   FeatureTypeName2 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   ...
**			   FeatureTypeNameN NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			FontName CharName ...
**
**		It then appends these samples into a separate file for each
**		character.  The name of the file is
**
**			DirectoryName/FontName/CharName.FeatureTypeName
**
**		The DirectoryName can be specified via a command
**		line argument.  If not specified, it defaults to the
**		current directory.  The format of the resulting files is:
**
**			NumberOfFeatures(M)
**			   Feature1
**			   ...
**			   FeatureM
**			NumberOfFeatures(M)
**			...
**
**		The output files each have a header which describes the
**		type of feature which the file contains.  This header is
**		in the format required by the clusterer.  A command line
**		argument can also be used to specify that only the first
**		N samples of each class should be used.
**	Return: none
**	Exceptions: none
**	History: Fri Aug 18 08:56:17 1989, DSJ, Created.
*/

{
	char	*PageName;
	FILE	*TrainingPage;
	LIST	CharList = NIL;
	CLUSTERER	*Clusterer = NULL;
	LIST		ProtoList = NIL;
	LIST		NormProtoList = NIL;
	LIST pCharList;
	LABELEDLIST CharSample;

	ParseArguments (argc, argv);
	while ((PageName = GetNextFilename(argc, argv)) != NULL)
	{
		printf ("Reading %s ...\n", PageName);
		TrainingPage = Efopen (PageName, "r");
		ReadTrainingSamples (TrainingPage, &CharList);
		fclose (TrainingPage);
		//WriteTrainingSamples (Directory, CharList);
	}
        printf("Clustering ...\n");
	pCharList = CharList;
	iterate(pCharList)
	{
          //Cluster
          CharSample = (LABELEDLIST) first_node (pCharList);
          //printf ("\nClustering %s ...", CharSample->Label);
          Clusterer = SetUpForClustering(CharSample, PROGRAM_FEATURE_TYPE);
          float SavedMinSamples = Config.MinSamples;
          Config.MagicSamples = CharSample->SampleCount;
          while (Config.MinSamples > 0.001) {
            ProtoList = ClusterSamples(Clusterer, &Config);
            if (NumberOfProtos(ProtoList, 1, 0) > 0)
              break;
            else {
              Config.MinSamples *= 0.95;
              printf("0 significant protos for %s."
                     " Retrying clustering with MinSamples = %f%%\n",
                     CharSample->Label, Config.MinSamples);
            }
          }
          Config.MinSamples = SavedMinSamples;
          AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
	}
	FreeTrainingSamples (CharList);
        if (Clusterer == NULL) // To avoid a SIGSEGV
          return 1;
	WriteNormProtos (Directory, NormProtoList, Clusterer);
	FreeClusterer(Clusterer);
	FreeProtoList(&ProtoList);
	FreeNormProtoList(NormProtoList);
	printf ("\n");
  return 0;
}	// main


/**----------------------------------------------------------------------------
							Private Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
void ReadTrainingSamples (
     FILE	*File,
	 LIST* TrainingSamples)

/*
**	Parameters:
**		File		open text file to read samples from
**	Globals: none
**	Operation:
**		This routine reads training samples from a file and
**		places them into a data structure which organizes the
**		samples by FontName and CharName.  It then returns this
**		data structure.
**	Return: none
**	Exceptions: none
**	History: Fri Aug 18 13:11:39 1989, DSJ, Created.
**			 Tue May 17 1998 simplifications to structure, illiminated
**				font, and feature specification levels of structure.
*/

{
	char		unichar[UNICHAR_LEN + 1];
	LABELEDLIST	CharSample;
	FEATURE_SET	FeatureSamples;
	CHAR_DESC	CharDesc;
	int		Type, i;

	while (fscanf (File, "%s %s", CTFontName, unichar) == 2) {
          CharSample = FindList (*TrainingSamples, unichar);
          if (CharSample == NULL) {
            CharSample = NewLabeledList (unichar);
            *TrainingSamples = push (*TrainingSamples, CharSample);
          }
          CharDesc = ReadCharDescription (File);
          Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
          FeatureSamples = CharDesc->FeatureSets[Type];
          for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
            FEATURE f = FeatureSamples->Features[feature];
            for (int dim =0; dim < f->Type->NumParams; ++dim)
              f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
          }
          CharSample->List = push (CharSample->List, FeatureSamples);
          CharSample->SampleCount++;
          for (i = 0; i < CharDesc->NumFeatureSets; i++)
            if (Type != i)
              FreeFeatureSet(CharDesc->FeatureSets[i]);
          free (CharDesc);
        }
}	// ReadTrainingSamples

/*----------------------------------------------------------------------------*/
void WriteNormProtos (
     char	*Directory,
     LIST	LabeledProtoList,
	 CLUSTERER *Clusterer)

/*
**	Parameters:
**		Directory	directory to place sample files into
**	Operation:
**		This routine writes the specified samples into files which
**		are organized according to the font name and character name
**		of the samples.
**	Return: none
**	Exceptions: none
**	History: Fri Aug 18 16:17:06 1989, DSJ, Created.
*/

{
	FILE		*File;
	char		Filename[MAXNAMESIZE];
	LABELEDLIST LabeledProto;
	int N;

	strcpy (Filename, "");
	if (Directory != NULL)
	{
		strcat (Filename, Directory);
		strcat (Filename, "/");
	}
	strcat (Filename, "normproto");
	printf ("\nWriting %s ...", Filename);
	File = Efopen (Filename, "w");
	fprintf(File,"%0d\n",Clusterer->SampleSize);
	WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
	iterate(LabeledProtoList)
	{
		LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
		N = NumberOfProtos(LabeledProto->List,
		ShowSignificantProtos, ShowInsignificantProtos);
                if (N < 1) {
                  printf ("\nError! Not enough protos for %s: %d protos"
                          " (%d significant protos"
                          ", %d insignificant protos)\n",
                          LabeledProto->Label, N,
                          NumberOfProtos(LabeledProto->List, 1, 0),
                          NumberOfProtos(LabeledProto->List, 0, 1));
                  exit(1);
                }
		fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
		WriteProtos(File, Clusterer->SampleSize, LabeledProto->List,
			ShowSignificantProtos, ShowInsignificantProtos);
	}
	fclose (File);

}	// WriteNormProtos

/*-------------------------------------------------------------------------*/
void WriteProtos(
     FILE	*File,
     uinT16	N,
     LIST	ProtoList,
     BOOL8	WriteSigProtos,
     BOOL8	WriteInsigProtos)
{
	PROTOTYPE	*Proto;

	// write prototypes
	iterate(ProtoList)
	{
		Proto = (PROTOTYPE *) first_node ( ProtoList );
		if (( Proto->Significant && WriteSigProtos )	||
			( ! Proto->Significant && WriteInsigProtos ) )
			WritePrototype( File, N, Proto );
	}
}	// WriteProtos