tesseract  3.05.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
STATS Class Reference

#include <statistc.h>

Public Member Functions

 STATS (inT32 min_bucket_value, inT32 max_bucket_value_plus_1)
 
 STATS ()
 
 ~STATS ()
 
bool set_range (inT32 min_bucket_value, inT32 max_bucket_value_plus_1)
 
void clear ()
 
void add (inT32 value, inT32 count)
 
inT32 mode () const
 
double mean () const
 
double sd () const
 
double ile (double frac) const
 
inT32 min_bucket () const
 
inT32 max_bucket () const
 
double median () const
 
inT32 pile_count (inT32 value) const
 
inT32 get_total () const
 
bool local_min (inT32 x) const
 
void smooth (inT32 factor)
 
inT32 cluster (float lower, float upper, float multiple, inT32 max_clusters, STATS *clusters)
 
int top_n_modes (int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
 
void print () const
 
void print_summary () const
 
void plot (ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
 
void plotline (ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
 

Detailed Description

Definition at line 33 of file statistc.h.

Constructor & Destructor Documentation

STATS::STATS ( inT32  min_bucket_value,
inT32  max_bucket_value_plus_1 
)

Definition at line 40 of file statistc.cpp.

40  {
41  if (max_bucket_value_plus_1 <= min_bucket_value) {
42  min_bucket_value = 0;
43  max_bucket_value_plus_1 = 1;
44  }
45  rangemin_ = min_bucket_value; // setup
46  rangemax_ = max_bucket_value_plus_1;
47  buckets_ = new inT32[rangemax_ - rangemin_];
48  clear();
49 }
int inT32
Definition: host.h:102
void clear()
Definition: statistc.cpp:81
STATS::STATS ( )

Definition at line 51 of file statistc.cpp.

51  {
52  rangemax_ = 0;
53  rangemin_ = 0;
54  buckets_ = NULL;
55 }
STATS::~STATS ( )

Definition at line 92 of file statistc.cpp.

92  {
93  if (buckets_ != NULL) {
94  delete [] buckets_;
95  buckets_ = NULL;
96  }
97 }

Member Function Documentation

void STATS::add ( inT32  value,
inT32  count 
)

Definition at line 104 of file statistc.cpp.

104  {
105  if (buckets_ == NULL) {
106  return;
107  }
108  value = ClipToRange(value, rangemin_, rangemax_ - 1);
109  buckets_[value - rangemin_] += count;
110  total_count_ += count; // keep count of total
111 }
int count(LIST var_list)
Definition: oldlist.cpp:108
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
void STATS::clear ( )

Definition at line 81 of file statistc.cpp.

81  { // clear out buckets
82  total_count_ = 0;
83  if (buckets_ != NULL)
84  memset(buckets_, 0, (rangemax_ - rangemin_) * sizeof(buckets_[0]));
85 }
inT32 STATS::cluster ( float  lower,
float  upper,
float  multiple,
inT32  max_clusters,
STATS *  clusters 
)

Definition at line 323 of file statistc.cpp.

327  { // array of clusters
328  BOOL8 new_cluster; // added one
329  float *centres; // cluster centres
330  inT32 entry; // bucket index
331  inT32 cluster; // cluster index
332  inT32 best_cluster; // one to assign to
333  inT32 new_centre = 0; // residual mode
334  inT32 new_mode; // pile count of new_centre
335  inT32 count; // pile to place
336  float dist; // from cluster
337  float min_dist; // from best_cluster
338  inT32 cluster_count; // no of clusters
339 
340  if (buckets_ == NULL || max_clusters < 1)
341  return 0;
342  centres = new float[max_clusters + 1];
343  for (cluster_count = 1; cluster_count <= max_clusters
344  && clusters[cluster_count].buckets_ != NULL
345  && clusters[cluster_count].total_count_ > 0;
346  cluster_count++) {
347  centres[cluster_count] =
348  static_cast<float>(clusters[cluster_count].ile(0.5));
349  new_centre = clusters[cluster_count].mode();
350  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
351  && entry >= rangemin_
352  && pile_count(entry) <= pile_count(entry + 1);
353  entry--) {
354  count = pile_count(entry) - clusters[0].pile_count(entry);
355  if (count > 0) {
356  clusters[cluster_count].add(entry, count);
357  clusters[0].add (entry, count);
358  }
359  }
360  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
361  && entry < rangemax_
362  && pile_count(entry) <= pile_count(entry - 1);
363  entry++) {
364  count = pile_count(entry) - clusters[0].pile_count(entry);
365  if (count > 0) {
366  clusters[cluster_count].add(entry, count);
367  clusters[0].add(entry, count);
368  }
369  }
370  }
371  cluster_count--;
372 
373  if (cluster_count == 0) {
374  clusters[0].set_range(rangemin_, rangemax_);
375  }
376  do {
377  new_cluster = FALSE;
378  new_mode = 0;
379  for (entry = 0; entry < rangemax_ - rangemin_; entry++) {
380  count = buckets_[entry] - clusters[0].buckets_[entry];
381  //remaining pile
382  if (count > 0) { //any to handle
383  min_dist = static_cast<float>(MAX_INT32);
384  best_cluster = 0;
385  for (cluster = 1; cluster <= cluster_count; cluster++) {
386  dist = entry + rangemin_ - centres[cluster];
387  //find distance
388  if (dist < 0)
389  dist = -dist;
390  if (dist < min_dist) {
391  min_dist = dist; //find least
392  best_cluster = cluster;
393  }
394  }
395  if (min_dist > upper //far enough for new
396  && (best_cluster == 0
397  || entry + rangemin_ > centres[best_cluster] * multiple
398  || entry + rangemin_ < centres[best_cluster] / multiple)) {
399  if (count > new_mode) {
400  new_mode = count;
401  new_centre = entry + rangemin_;
402  }
403  }
404  }
405  }
406  // need new and room
407  if (new_mode > 0 && cluster_count < max_clusters) {
408  cluster_count++;
409  new_cluster = TRUE;
410  if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
411  delete [] centres;
412  return 0;
413  }
414  centres[cluster_count] = static_cast<float>(new_centre);
415  clusters[cluster_count].add(new_centre, new_mode);
416  clusters[0].add(new_centre, new_mode);
417  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
418  && entry >= rangemin_
419  && pile_count (entry) <= pile_count(entry + 1); entry--) {
420  count = pile_count(entry) - clusters[0].pile_count(entry);
421  if (count > 0) {
422  clusters[cluster_count].add(entry, count);
423  clusters[0].add(entry, count);
424  }
425  }
426  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
427  && entry < rangemax_
428  && pile_count (entry) <= pile_count(entry - 1); entry++) {
429  count = pile_count(entry) - clusters[0].pile_count(entry);
430  if (count > 0) {
431  clusters[cluster_count].add(entry, count);
432  clusters[0].add (entry, count);
433  }
434  }
435  centres[cluster_count] =
436  static_cast<float>(clusters[cluster_count].ile(0.5));
437  }
438  } while (new_cluster && cluster_count < max_clusters);
439  delete [] centres;
440  return cluster_count;
441 }
int count(LIST var_list)
Definition: oldlist.cpp:108
#define TRUE
Definition: capi.h:45
inT32 mode() const
Definition: statistc.cpp:118
#define MAX_INT32
Definition: host.h:120
int inT32
Definition: host.h:102
double ile(double frac) const
Definition: statistc.cpp:177
#define FALSE
Definition: capi.h:46
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
unsigned char BOOL8
Definition: host.h:113
bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1)
Definition: statistc.cpp:62
inT32 cluster(float lower, float upper, float multiple, inT32 max_clusters, STATS *clusters)
Definition: statistc.cpp:323
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
inT32 STATS::get_total ( ) const
inline

Definition at line 86 of file statistc.h.

86  {
87  return total_count_; // total of all piles
88  }
double STATS::ile ( double  frac) const

Definition at line 177 of file statistc.cpp.

177  {
178  if (buckets_ == NULL || total_count_ == 0) {
179  return static_cast<double>(rangemin_);
180  }
181 #if 0
182  // TODO(rays) The existing code doesn't seem to be doing the right thing
183  // with target a double but this substitute crashes the code that uses it.
184  // Investigate and fix properly.
185  int target = IntCastRounded(frac * total_count_);
186  target = ClipToRange(target, 1, total_count_);
187 #else
188  double target = frac * total_count_;
189  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
190 #endif
191  int sum = 0;
192  int index = 0;
193  for (index = 0; index < rangemax_ - rangemin_ && sum < target;
194  sum += buckets_[index++]);
195  if (index > 0) {
196  ASSERT_HOST(buckets_[index - 1] > 0);
197  return rangemin_ + index -
198  static_cast<double>(sum - target) / buckets_[index - 1];
199  } else {
200  return static_cast<double>(rangemin_);
201  }
202 }
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
int IntCastRounded(double x)
Definition: helpers.h:172
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool STATS::local_min ( inT32  x) const

Definition at line 265 of file statistc.cpp.

265  {
266  if (buckets_ == NULL) {
267  return false;
268  }
269  x = ClipToRange(x, rangemin_, rangemax_ - 1) - rangemin_;
270  if (buckets_[x] == 0)
271  return true;
272  inT32 index; // table index
273  for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index);
274  if (index >= 0 && buckets_[index] < buckets_[x])
275  return false;
276  for (index = x + 1; index < rangemax_ - rangemin_ &&
277  buckets_[index] == buckets_[x]; ++index);
278  if (index < rangemax_ - rangemin_ && buckets_[index] < buckets_[x])
279  return false;
280  else
281  return true;
282 }
int inT32
Definition: host.h:102
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
inT32 STATS::max_bucket ( ) const

Definition at line 224 of file statistc.cpp.

224  { // Find max
225  if (buckets_ == NULL || total_count_ == 0) {
226  return rangemin_;
227  }
228  inT32 max;
229  for (max = rangemax_ - rangemin_ - 1; max > 0 && buckets_[max] == 0; max--);
230  return rangemin_ + max;
231 }
int inT32
Definition: host.h:102
double STATS::mean ( ) const

Definition at line 138 of file statistc.cpp.

138  { //get mean of samples
139  if (buckets_ == NULL || total_count_ <= 0) {
140  return static_cast<double>(rangemin_);
141  }
142  inT64 sum = 0;
143  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
144  sum += static_cast<inT64>(index) * buckets_[index];
145  }
146  return static_cast<double>(sum) / total_count_ + rangemin_;
147 }
long long int inT64
Definition: host.h:108
double STATS::median ( ) const

Definition at line 242 of file statistc.cpp.

242  { //get median
243  if (buckets_ == NULL) {
244  return static_cast<double>(rangemin_);
245  }
246  double median = ile(0.5);
247  int median_pile = static_cast<int>(floor(median));
248  if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
249  inT32 min_pile;
250  inT32 max_pile;
251  /* Find preceding non zero pile */
252  for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--);
253  /* Find following non zero pile */
254  for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++);
255  median = (min_pile + max_pile) / 2.0;
256  }
257  return median;
258 }
int inT32
Definition: host.h:102
double ile(double frac) const
Definition: statistc.cpp:177
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
double median() const
Definition: statistc.cpp:242
inT32 STATS::min_bucket ( ) const

Definition at line 209 of file statistc.cpp.

209  { // Find min
210  if (buckets_ == NULL || total_count_ == 0) {
211  return rangemin_;
212  }
213  inT32 min = 0;
214  for (min = 0; (min < rangemax_ - rangemin_) && (buckets_[min] == 0); min++);
215  return rangemin_ + min;
216 }
int inT32
Definition: host.h:102
inT32 STATS::mode ( ) const

Definition at line 118 of file statistc.cpp.

118  { // get mode of samples
119  if (buckets_ == NULL) {
120  return rangemin_;
121  }
122  inT32 max = buckets_[0]; // max cell count
123  inT32 maxindex = 0; // index of max
124  for (int index = rangemax_ - rangemin_ - 1; index > 0; --index) {
125  if (buckets_[index] > max) {
126  max = buckets_[index]; // find biggest
127  maxindex = index;
128  }
129  }
130  return maxindex + rangemin_; // index of biggest
131 }
int inT32
Definition: host.h:102
inT32 STATS::pile_count ( inT32  value) const
inline

Definition at line 78 of file statistc.h.

78  {
79  if (value <= rangemin_)
80  return buckets_[0];
81  if (value >= rangemax_ - 1)
82  return buckets_[rangemax_ - rangemin_ - 1];
83  return buckets_[value - rangemin_];
84  }
void STATS::plot ( ScrollView *  window,
float  xorigin,
float  yorigin,
float  xscale,
float  yscale,
ScrollView::Color  colour 
) const

Definition at line 588 of file statistc.cpp.

593  { // colour to draw in
594  if (buckets_ == NULL) {
595  return;
596  }
597  window->Pen(colour);
598 
599  for (int index = 0; index < rangemax_ - rangemin_; index++) {
600  window->Rectangle( xorigin + xscale * index, yorigin,
601  xorigin + xscale * (index + 1),
602  yorigin + yscale * buckets_[index]);
603  }
604 }
void Pen(Color color)
Definition: scrollview.cpp:726
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
void STATS::plotline ( ScrollView *  window,
float  xorigin,
float  yorigin,
float  xscale,
float  yscale,
ScrollView::Color  colour 
) const

Definition at line 615 of file statistc.cpp.

620  { // colour to draw in
621  if (buckets_ == NULL) {
622  return;
623  }
624  window->Pen(colour);
625  window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
626  for (int index = 0; index < rangemax_ - rangemin_; index++) {
627  window->DrawTo(xorigin + xscale * index,
628  yorigin + yscale * buckets_[index]);
629  }
630 }
void SetCursor(int x, int y)
Definition: scrollview.cpp:525
void Pen(Color color)
Definition: scrollview.cpp:726
void DrawTo(int x, int y)
Definition: scrollview.cpp:531
void STATS::print ( ) const

Definition at line 537 of file statistc.cpp.

537  {
538  if (buckets_ == NULL) {
539  return;
540  }
541  inT32 min = min_bucket() - rangemin_;
542  inT32 max = max_bucket() - rangemin_;
543 
544  int num_printed = 0;
545  for (int index = min; index <= max; index++) {
546  if (buckets_[index] != 0) {
547  tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
548  if (++num_printed % 8 == 0)
549  tprintf ("\n");
550  }
551  }
552  tprintf ("\n");
553  print_summary();
554 }
inT32 max_bucket() const
Definition: statistc.cpp:224
int inT32
Definition: host.h:102
#define tprintf(...)
Definition: tprintf.h:31
inT32 min_bucket() const
Definition: statistc.cpp:209
void print_summary() const
Definition: statistc.cpp:563
void STATS::print_summary ( ) const

Definition at line 563 of file statistc.cpp.

563  {
564  if (buckets_ == NULL) {
565  return;
566  }
567  inT32 min = min_bucket();
568  inT32 max = max_bucket();
569  tprintf("Total count=%d\n", total_count_);
570  tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
571  tprintf("Lower quartile=%.2f\n", ile(0.25));
572  tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
573  tprintf("Upper quartile=%.2f\n", ile(0.75));
574  tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
575  tprintf("Range=%d\n", max + 1 - min);
576  tprintf("Mean= %.2f\n", mean());
577  tprintf("SD= %.2f\n", sd());
578 }
inT32 max_bucket() const
Definition: statistc.cpp:224
int inT32
Definition: host.h:102
double mean() const
Definition: statistc.cpp:138
double ile(double frac) const
Definition: statistc.cpp:177
#define tprintf(...)
Definition: tprintf.h:31
inT32 min_bucket() const
Definition: statistc.cpp:209
double sd() const
Definition: statistc.cpp:154
double median() const
Definition: statistc.cpp:242
double STATS::sd ( ) const

Definition at line 154 of file statistc.cpp.

154  { //standard deviation
155  if (buckets_ == NULL || total_count_ <= 0) {
156  return 0.0;
157  }
158  inT64 sum = 0;
159  double sqsum = 0.0;
160  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
161  sum += static_cast<inT64>(index) * buckets_[index];
162  sqsum += static_cast<double>(index) * index * buckets_[index];
163  }
164  double variance = static_cast<double>(sum) / total_count_;
165  variance = sqsum / total_count_ - variance * variance;
166  if (variance > 0.0)
167  return sqrt(variance);
168  return 0.0;
169 }
long long int inT64
Definition: host.h:108
bool STATS::set_range ( inT32  min_bucket_value,
inT32  max_bucket_value_plus_1 
)

Definition at line 62 of file statistc.cpp.

62  {
63  if (max_bucket_value_plus_1 <= min_bucket_value) {
64  return false;
65  }
66  if (rangemax_ - rangemin_ != max_bucket_value_plus_1 - min_bucket_value) {
67  delete [] buckets_;
68  buckets_ = new inT32[max_bucket_value_plus_1 - min_bucket_value];
69  }
70  rangemin_ = min_bucket_value; // setup
71  rangemax_ = max_bucket_value_plus_1;
72  clear(); // zero it
73  return true;
74 }
int inT32
Definition: host.h:102
void clear()
Definition: statistc.cpp:81
void STATS::smooth ( inT32  factor)

Definition at line 292 of file statistc.cpp.

292  {
293  if (buckets_ == NULL || factor < 2) {
294  return;
295  }
296  STATS result(rangemin_, rangemax_);
297  int entrycount = rangemax_ - rangemin_;
298  for (int entry = 0; entry < entrycount; entry++) {
299  //centre weight
300  int count = buckets_[entry] * factor;
301  for (int offset = 1; offset < factor; offset++) {
302  if (entry - offset >= 0)
303  count += buckets_[entry - offset] * (factor - offset);
304  if (entry + offset < entrycount)
305  count += buckets_[entry + offset] * (factor - offset);
306  }
307  result.add(entry + rangemin_, count);
308  }
309  total_count_ = result.total_count_;
310  memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
311 }
int count(LIST var_list)
Definition: oldlist.cpp:108
STATS
Definition: statistc.h:33
int STATS::top_n_modes ( int  max_modes,
GenericVector< tesseract::KDPairInc< float, int > > *  modes 
) const

Definition at line 472 of file statistc.cpp.

473  {
474  if (max_modes <= 0) return 0;
475  int src_count = rangemax_ - rangemin_;
476  // Used copies the counts in buckets_ as they get used.
477  STATS used(rangemin_, rangemax_);
478  modes->truncate(0);
479  // Total count of the smallest peak found so far.
480  int least_count = 1;
481  // Mode that is used as a seed for each peak
482  int max_count = 0;
483  do {
484  // Find an unused mode.
485  max_count = 0;
486  int max_index = 0;
487  for (int src_index = 0; src_index < src_count; src_index++) {
488  int pile_count = buckets_[src_index] - used.buckets_[src_index];
489  if (pile_count > max_count) {
490  max_count = pile_count;
491  max_index = src_index;
492  }
493  }
494  if (max_count > 0) {
495  // Copy the bucket count to used so it doesn't get found again.
496  used.buckets_[max_index] = max_count;
497  // Get the entire peak.
498  double total_value = max_index * max_count;
499  int total_count = max_count;
500  int prev_pile = max_count;
501  for (int offset = 1; max_index + offset < src_count; ++offset) {
502  if (!GatherPeak(max_index + offset, buckets_, used.buckets_,
503  &prev_pile, &total_count, &total_value))
504  break;
505  }
506  prev_pile = buckets_[max_index];
507  for (int offset = 1; max_index - offset >= 0; ++offset) {
508  if (!GatherPeak(max_index - offset, buckets_, used.buckets_,
509  &prev_pile, &total_count, &total_value))
510  break;
511  }
512  if (total_count > least_count || modes->size() < max_modes) {
513  // We definitely want this mode, so if we have enough discard the least.
514  if (modes->size() == max_modes)
515  modes->truncate(max_modes - 1);
516  int target_index = 0;
517  // Linear search for the target insertion point.
518  while (target_index < modes->size() &&
519  (*modes)[target_index].data >= total_count)
520  ++target_index;
521  float peak_mean =
522  static_cast<float>(total_value / total_count + rangemin_);
523  modes->insert(KDPairInc<float, int>(peak_mean, total_count),
524  target_index);
525  least_count = modes->back().data;
526  }
527  }
528  } while (max_count > 0);
529  return modes->size();
530 }
T & back() const
void truncate(int size)
int size() const
Definition: genericvector.h:72
STATS
Definition: statistc.h:33
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
void insert(T t, int index)

The documentation for this class was generated from the following files: