#include <language_model.h>

Collaboration diagram for tesseract::LanguageModel:

[legend]

Public Member Functions
	LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)

	~LanguageModel ()

void	InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)

bool	UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

bool	AcceptableChoiceFound ()

void	SetAcceptableChoiceFound (bool val)

ParamsModel &	getParamsModel ()

	INT_VAR_H (language_model_debug_level, 0, "Language model debug level")

	BOOL_VAR_H (language_model_ngram_on, false, "Turn on/off the use of character ngram model")

	INT_VAR_H (language_model_ngram_order, 8, "Maximum order of the character ngram model")

	INT_VAR_H (language_model_viterbi_list_max_num_prunable, 10, "Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs")

	INT_VAR_H (language_model_viterbi_list_max_size, 500, "Maximum size of viterbi lists recorded in BLOB_CHOICEs")

	double_VAR_H (language_model_ngram_small_prob, 0.000001, "To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model")

	double_VAR_H (language_model_ngram_nonmatch_score, -40.0, "Average classifier score of a non-matching unichar")

	BOOL_VAR_H (language_model_ngram_use_only_first_uft8_step, false, "Use only the first UTF8 step of the given string" " when computing log probabilities")

	double_VAR_H (language_model_ngram_scale_factor, 0.03, "Strength of the character ngram model relative to the" " character classifier ")

	double_VAR_H (language_model_ngram_rating_factor, 16.0, "Factor to bring log-probs into the same range as ratings" " when multiplied by outline length ")

	BOOL_VAR_H (language_model_ngram_space_delimited_language, true, "Words are delimited by space")

	INT_VAR_H (language_model_min_compound_length, 3, "Minimum length of compound words")

	double_VAR_H (language_model_penalty_non_freq_dict_word, 0.1, "Penalty for words not in the frequent word dictionary")

	double_VAR_H (language_model_penalty_non_dict_word, 0.15, "Penalty for non-dictionary words")

	double_VAR_H (language_model_penalty_punc, 0.2, "Penalty for inconsistent punctuation")

	double_VAR_H (language_model_penalty_case, 0.1, "Penalty for inconsistent case")

	double_VAR_H (language_model_penalty_script, 0.5, "Penalty for inconsistent script")

	double_VAR_H (language_model_penalty_chartype, 0.3, "Penalty for inconsistent character type")

	double_VAR_H (language_model_penalty_font, 0.00, "Penalty for inconsistent font")

	double_VAR_H (language_model_penalty_spacing, 0.05, "Penalty for inconsistent spacing")

	double_VAR_H (language_model_penalty_increment, 0.01, "Penalty increment")

	INT_VAR_H (wordrec_display_segmentations, 0, "Display Segmentations")

	BOOL_VAR_H (language_model_use_sigmoidal_certainty, false, "Use sigmoidal score for certainty")

Static Public Member Functions
static void	ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])

Static Public Attributes
static const LanguageModelFlagsType	kSmallestRatingFlag = 0x1

static const LanguageModelFlagsType	kLowerCaseFlag = 0x2

static const LanguageModelFlagsType	kUpperCaseFlag = 0x4

static const LanguageModelFlagsType	kDigitFlag = 0x8

static const LanguageModelFlagsType	kXhtConsistentFlag = 0x10

static const float	kMaxAvgNgramCost = 25.0f

Protected Member Functions
float	CertaintyScore (float cert)

float	ComputeAdjustment (int num_problems, float penalty)

float	ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)

float	ComputeAdjustedPathCost (ViterbiStateEntry *vse)

bool	GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const

int	SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const

ViterbiStateEntry *	GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const

bool	AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

void	GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)

LanguageModelDawgInfo *	GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)

LanguageModelNgramInfo *	GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)

float	ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)

float	ComputeDenom (BLOB_CHOICE_LIST *curr_list)

void	FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)

void	UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

WERD_CHOICE *	ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)

void	ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)

bool	PrunablePath (const ViterbiStateEntry &vse)

bool	AcceptablePath (const ViterbiStateEntry &vse)

Protected Attributes
DawgArgs	dawg_args_

float	rating_cert_scale_

const UnicityTable< FontInfo > *	fontinfo_table_

Dict *	dict_

bool	fixed_pitch_

float	max_char_wh_ratio_

STRING	prev_word_str_

int	prev_word_unichar_step_len_

DawgPositionVector	very_beginning_active_dawgs_

DawgPositionVector	beginning_active_dawgs_

bool	acceptable_choice_found_

bool	correct_segmentation_explored_

ParamsModel	params_model_

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel	(	const UnicityTable< FontInfo > *	fontinfo_table,
		Dict *	dict
	)

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )

inline

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry & vse )

inlineprotected

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry	(	LanguageModelFlagsType	top_choice_flags,
		float	denom,
		bool	word_end,
		int	curr_col,
		int	curr_row,
		BLOB_CHOICE *	b,
		LanguageModelState *	curr_state,
		ViterbiStateEntry *	parent_vse,
		LMPainPoints *	pain_points,
		WERD_RES *	word_res,
		BestChoiceBundle *	best_choice_bundle,
		BlamerBundle *	blamer_bundle
	)

protected

◆ BOOL_VAR_H() [1/4]

tesseract::LanguageModel::BOOL_VAR_H	(	language_model_ngram_on	,
		false	,
		"Turn on/off the use of character ngram model"
	)

◆ BOOL_VAR_H() [2/4]

tesseract::LanguageModel::BOOL_VAR_H	(	language_model_ngram_use_only_first_uft8_step	,
		false	,
		"Use only the first UTF8 step of the given string" " when computing log probabilities"
	)

◆ BOOL_VAR_H() [3/4]

tesseract::LanguageModel::BOOL_VAR_H	(	language_model_ngram_space_delimited_language	,
		true	,
		"Words are delimited by space"
	)

◆ BOOL_VAR_H() [4/4]

tesseract::LanguageModel::BOOL_VAR_H	(	language_model_use_sigmoidal_certainty	,
		false	,
		"Use sigmoidal score for certainty"
	)

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float cert )

inlineprotected

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry * vse )

protected

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment	(	int	num_problems,
		float	penalty
	)

inlineprotected

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats	(	int	col,
		int	row,
		float	max_char_wh_ratio,
		ViterbiStateEntry *	parent_vse,
		WERD_RES *	word_res,
		AssociateStats *	associate_stats
	)

inlineprotected

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment	(	const LanguageModelDawgInfo *	dawg_info,
		const LMConsistencyInfo &	consistency_info
	)

inlineprotected

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST * curr_list )

protected

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost	(	const char *	unichar,
		float	certainty,
		float	denom,
		const char *	context,
		int *	unichar_step_len,
		bool *	found_small_prob,
		float *	ngram_prob
	)

protected

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord	(	ViterbiStateEntry *	vse,
		WERD_RES *	word_res,
		DANGERR *	fixpt,
		BlamerBundle *	blamer_bundle,
		bool *	truth_path
	)

protected

◆ double_VAR_H() [1/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_ngram_small_prob	,
		0.000001	,
		"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"
	)

◆ double_VAR_H() [2/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_ngram_nonmatch_score	,
		-40.0	,
		"Average classifier score of a non-matching unichar"
	)

◆ double_VAR_H() [3/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_ngram_scale_factor	,
		0.03	,
		"Strength of the character ngram model relative to the" " character classifier "
	)

◆ double_VAR_H() [4/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_ngram_rating_factor	,
		16.0	,
		"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "
	)

◆ double_VAR_H() [5/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_non_freq_dict_word	,
		0.1	,
		"Penalty for words not in the frequent word dictionary"
	)

◆ double_VAR_H() [6/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_non_dict_word	,
		0.15	,
		"Penalty for non-dictionary words"
	)

◆ double_VAR_H() [7/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_punc	,
		0.2	,
		"Penalty for inconsistent punctuation"
	)

◆ double_VAR_H() [8/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_case	,
		0.1	,
		"Penalty for inconsistent case"
	)

◆ double_VAR_H() [9/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_script	,
		0.5	,
		"Penalty for inconsistent script"
	)

◆ double_VAR_H() [10/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_chartype	,
		0.3	,
		"Penalty for inconsistent character type"
	)

◆ double_VAR_H() [11/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_font	,
		0.00	,
		"Penalty for inconsistent font"
	)

◆ double_VAR_H() [12/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_spacing	,
		0.05	,
		"Penalty for inconsistent spacing"
	)

◆ double_VAR_H() [13/13]

tesseract::LanguageModel::double_VAR_H	(	language_model_penalty_increment	,
		0.01	,
		"Penalty increment"
	)

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath	(	const ViterbiStateEntry &	vse,
		float	features[]
	)

static

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo	(	int	curr_col,
		bool	word_end,
		BLOB_CHOICE *	b,
		ViterbiStateEntry *	parent_vse,
		WERD_RES *	word_res,
		LMConsistencyInfo *	consistency_info
	)

protected

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo	(	bool	word_end,
		int	curr_col,
		int	curr_row,
		const BLOB_CHOICE &	b,
		const ViterbiStateEntry *	parent_vse
	)

protected

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo	(	const char *	unichar,
		float	certainty,
		float	denom,
		int	curr_col,
		int	curr_row,
		float	outline_length,
		const ViterbiStateEntry *	parent_vse
	)

protected

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo	(	ViterbiStateEntry *	new_vse,
		const ViterbiStateEntry *	parent_vse,
		LanguageModelState *	lms
	)

protected

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE	(	bool	just_classified,
		bool	mixed_alnum,
		const BLOB_CHOICE *	bc,
		LanguageModelFlagsType	blob_choice_flags,
		const UNICHARSET &	unicharset,
		WERD_RES *	word_res,
		ViterbiStateEntry_IT *	vse_it,
		LanguageModelFlagsType *	top_choice_flags
	)		const

protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )

inline

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit	(	BLOB_CHOICE_LIST *	curr_list,
		BLOB_CHOICE **	first_lower,
		BLOB_CHOICE **	first_upper,
		BLOB_CHOICE **	first_digit
	)		const

protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

◆ InitForWord()

void tesseract::LanguageModel::InitForWord	(	const WERD_CHOICE *	prev_word,
		bool	fixed_pitch,
		float	max_char_wh_ratio,
		float	rating_cert_scale
	)

◆ INT_VAR_H() [1/6]

tesseract::LanguageModel::INT_VAR_H	(	language_model_debug_level	,
		0	,
		"Language model debug level"
	)

◆ INT_VAR_H() [2/6]

tesseract::LanguageModel::INT_VAR_H	(	language_model_ngram_order	,
		8	,
		"Maximum order of the character ngram model"
	)

◆ INT_VAR_H() [3/6]

tesseract::LanguageModel::INT_VAR_H	(	language_model_viterbi_list_max_num_prunable	,
		10	,
		"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"
	)

◆ INT_VAR_H() [4/6]

tesseract::LanguageModel::INT_VAR_H	(	language_model_viterbi_list_max_size	,
		500	,
		"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
	)

◆ INT_VAR_H() [5/6]

tesseract::LanguageModel::INT_VAR_H	(	language_model_min_compound_length	,
		3	,
		"Minimum length of compound words"
	)

◆ INT_VAR_H() [6/6]

tesseract::LanguageModel::INT_VAR_H	(	wordrec_display_segmentations	,
		0	,
		"Display Segmentations"
	)

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry & vse )

inlineprotected

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool val )

inline

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState * parent_node ) const

protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice	(	ViterbiStateEntry *	vse,
		LMPainPoints *	pain_points,
		WERD_RES *	word_res,
		BestChoiceBundle *	best_choice_bundle,
		BlamerBundle *	blamer_bundle
	)

protected

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState	(	bool	just_classified,
		int	curr_col,
		int	curr_row,
		BLOB_CHOICE_LIST *	curr_list,
		LanguageModelState *	parent_node,
		LMPainPoints *	pain_points,
		WERD_RES *	word_res,
		BestChoiceBundle *	best_choice_bundle,
		BlamerBundle *	blamer_bundle
	)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

paths that are liked by the language model: either a DAWG or the n-gram model, where active.
paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

float tesseract::LanguageModel::rating_cert_scale_

protected

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_

protected

The documentation for this class was generated from the following files:

/home/stephane/src/tesseract/src/wordrec/language_model.h
/home/stephane/src/tesseract/src/wordrec/language_model.cpp

Public Member Functions

Static Public Member Functions

Static Public Attributes

Protected Member Functions

Protected Attributes

Constructor & Destructor Documentation

◆ LanguageModel()

◆ ~LanguageModel()

Member Function Documentation

◆ AcceptableChoiceFound()

◆ AcceptablePath()

◆ AddViterbiStateEntry()

◆ BOOL_VAR_H() [1/4]

◆ BOOL_VAR_H() [2/4]

◆ BOOL_VAR_H() [3/4]

◆ BOOL_VAR_H() [4/4]

◆ CertaintyScore()

◆ ComputeAdjustedPathCost()

◆ ComputeAdjustment()

◆ ComputeAssociateStats()

◆ ComputeConsistencyAdjustment()

◆ ComputeDenom()

◆ ComputeNgramCost()

◆ ConstructWord()

◆ double_VAR_H() [1/13]

◆ double_VAR_H() [2/13]

◆ double_VAR_H() [3/13]

◆ double_VAR_H() [4/13]

◆ double_VAR_H() [5/13]

◆ double_VAR_H() [6/13]

◆ double_VAR_H() [7/13]

◆ double_VAR_H() [8/13]

◆ double_VAR_H() [9/13]

◆ double_VAR_H() [10/13]

◆ double_VAR_H() [11/13]

◆ double_VAR_H() [12/13]

◆ double_VAR_H() [13/13]

◆ ExtractFeaturesFromPath()

◆ FillConsistencyInfo()

◆ GenerateDawgInfo()

◆ GenerateNgramInfo()

◆ GenerateTopChoiceInfo()

◆ GetNextParentVSE()

◆ getParamsModel()

◆ GetTopLowerUpperDigit()

◆ InitForWord()

◆ INT_VAR_H() [1/6]

◆ INT_VAR_H() [2/6]

◆ INT_VAR_H() [3/6]

◆ INT_VAR_H() [4/6]

◆ INT_VAR_H() [5/6]

◆ INT_VAR_H() [6/6]

◆ PrunablePath()

◆ SetAcceptableChoiceFound()

◆ SetTopParentLowerUpperDigit()

◆ UpdateBestChoice()

◆ UpdateState()

Member Data Documentation

◆ acceptable_choice_found_

◆ beginning_active_dawgs_

◆ correct_segmentation_explored_

◆ dawg_args_

◆ dict_

◆ fixed_pitch_

◆ fontinfo_table_

◆ kDigitFlag

◆ kLowerCaseFlag

◆ kMaxAvgNgramCost

◆ kSmallestRatingFlag

◆ kUpperCaseFlag

◆ kXhtConsistentFlag

◆ max_char_wh_ratio_

◆ params_model_

◆ prev_word_str_

◆ prev_word_unichar_step_len_

◆ rating_cert_scale_

◆ very_beginning_active_dawgs_