| File: | blib/lib/Stats/LikeR.pm |
| Coverage: | 79.7% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
#!/usr/bin/env perl
# Stats::LikeR - R-like statistical functions for Perl, implemented in XS
# for speed. This preamble sets up pragmas, loads the compiled XS code,
# and declares the exported API.
require 5.010;
package Stats::LikeR;
our $VERSION = 0.01;

use strict;                     # was missing; all code below is strict-clean
# Diagnostic helpers: DDP provides p() (used to report bad arguments in
# read_table); Devel::Confess turns die into a colored stack trace.
use DDP {output => 'STDOUT', array_max => 10, show_memsize => 1};
use Devel::Confess 'color';
use warnings FATAL => 'all';
use autodie ':default';
use Exporter 'import';

# Load the XS implementation (_chisq_c, _parse_csv_file, and friends).
# NOTE: the original file had a second, redundant "require XSLoader;"
# after the export lists; require is idempotent, so dropping it is safe.
require XSLoader;
XSLoader::load('Stats::LikeR', $VERSION);

our @EXPORT_OK = qw(aov chisq_test cor cor_test cov fisher_test glm hist kruskal_test ks_test lm matrix mean median min max p_adjust power_t_test quantile rbinom read_table rnorm runif scale sd seq shapiro_test t_test var wilcox_test write_table);
our @EXPORT = @EXPORT_OK;
| 17 | # Wrapper to mimic R's structure | |||||
# Pearson's chi-squared test on an array reference (a matrix as an AoA, or
# a flat list for a goodness-of-fit test). The numeric work is done by the
# XS routine _chisq_c; this wrapper only validates input and reshapes the
# result into a hash that mirrors R's "htest" object.
sub chisq_test {
    my ($observed) = @_;

    die 'Input must be an array reference' unless ref($observed) eq 'ARRAY';

    # Heavy lifting happens in C; $res carries statistic/df/p_value/
    # method/expected.
    my $res = _chisq_c($observed);

    my %htest = (
        'statistic' => { 'X-squared' => $res->{statistic} },
        'parameter' => { 'df' => $res->{df} },
        'p.value'   => $res->{p_value},
        'method'    => $res->{method},
        'data.name' => 'Perl ArrayRef',
        'observed'  => $observed,
        'expected'  => $res->{expected},
    );
    return \%htest;
}
| 37 | ||||||
# read_table($file, %options) - stream a delimited text file through the XS
# parser _parse_csv_file and build one of three structures selected by
# 'output.type':
#   aoh - array of { column => value } hashes (default)
#   hoa - hash of column => [values]
#   hoh - hash of column => { row_name => value }
# Options: sep (default ','), comment (default '#'), filter (CODE or HASH
# of column-name/index => CODE), row.names, output.type, substitutions.
# Dies on unreadable files, unknown options, bad output.type, bad filter
# type, missing filter/row.names columns, and ragged rows.
sub read_table {
    my $file = shift;
    # Stacked -r -f file tests (5.10+): readable AND a plain file.
    die "\"$file\" is either unreadable or not a file" unless -r -f $file;
    my %args = (
        sep => ',', comment => '#',
        @_,
    );
    # Reject any option not in the whitelist; report them all at once.
    my %allowed_args = map {$_ => 1} (
        'comment', 'output.type', 'filter', 'row.names', 'sep', 'substitutions'
    );
    my @undef_args = sort grep {!$allowed_args{$_}} keys %args;
    # Bare sub name (strip the package path) for error messages.
    my $current_sub = (split(/::/,(caller(0))[3]))[-1];
    if (scalar @undef_args > 0) {
        p @undef_args; # DDP dump of the offending option names
        die "the above args aren't defined for $current_sub";
    }
    $args{'output.type'} = $args{'output.type'} // 'aoh';
    if ($args{'output.type'} !~ m/^(?:aoh|hoa|hoh)$/) {
        die "\"$args{'output.type'}\" isn't allowed";
    }
    # Normalize the filter argument: a bare coderef becomes a whole-row
    # filter keyed by 0; anything else must already be a hash of filters.
    my $filter = $args{filter};
    if (defined $filter && ref($filter) eq 'CODE') {
        $filter = { 0 => $filter }; # 0 means the whole row
    } elsif (defined $filter && ref($filter) ne 'HASH') {
        die "'filter' must be a CODE or HASH reference";
    }
    # Accumulators shared with the parse callback below:
    #   @data/%data - the result (which one is used depends on output.type)
    #   @header     - column names; empty until the first line is seen
    #   %mapped_filters - filters re-keyed to 1-based column index (0 = row)
    my (@data, %data, @header, %mapped_filters);
    # Execute the fast C-state machine. Pass an anonymous coderef to process streams.
    # This bypasses creating an intermediate AoA memory spike.
    _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {
        my ($line_ref) = @_;
        my @line = @$line_ref;
        if (!@header) {
            # --- HEADER PROCESSING ---
            # Strip a leading comment marker from the first header field.
            $line[0] =~ s/^\Q$args{comment}\E// if @line && defined $line[0];
            # Drop trailing empty header cells.
            while (@line && $line[-1] eq '') { pop @line }
            @header = @line;
            # R-LIKE BEHAVIOR: an unnamed first column becomes 'row_name'.
            if ((scalar @header > 0) && ($header[0] eq '')) {
                $header[0] = 'row_name';
            }
            # hoh needs a row-name column; default to the first header.
            if (($args{'output.type'} eq 'hoh') && (not defined $args{'row.names'})) {
                $args{'row.names'} = $header[0];
            }
            if ((defined $args{'row.names'}) && (!grep {$_ eq $args{'row.names'}} @header)) {
                die "\"$args{'row.names'}\" isn't in the header of $file";
            }
            # Map filters to 1-based indices (or 0 for whole row)
            if ($filter) {
                for my $k (keys %$filter) {
                    if ($k =~ /^\d+$/) {
                        # Already numeric: assumed to be 1-based (or 0 for
                        # whole row) -- TODO confirm callers never pass a
                        # 0-based numeric index here.
                        $mapped_filters{$k} = $filter->{$k};
                    } else {
                        # Column name: resolve to its header position.
                        my ($idx) = grep { $header[$_] eq $k } 0..$#header;
                        die "Filter column '$k' not found in header" unless defined $idx;
                        $mapped_filters{$idx + 1} = $filter->{$k};
                    }
                }
            }
            return; # Equivalent to 'next' out of the closure
        }
        # Check for column alignment
        if (scalar @line != scalar @header) {
            die "Alignment error on $file (" . scalar(@line) . " fields vs " . scalar(@header) . " headers).";
        }
        # --- DATA PROCESSING ---
        # Build a fresh column=>value hash per row (fresh lexical each call,
        # so pushing \%line_hash later yields distinct references).
        my %line_hash;
        for my $i (0 .. $#header) {
            if (!defined($line[$i]) || $line[$i] eq '') {
                $line_hash{$header[$i]} = 'NA'; # missing values become 'NA', as in R
            } else {
                $line_hash{$header[$i]} = $line[$i];
            }
        }
        # --- APPLY FILTERS ---
        # Filters run in ascending column order; the first one returning
        # false rejects the row and short-circuits the rest.
        my $skip = 0;
        if (%mapped_filters) {
            foreach my $fld (sort { $a <=> $b } keys %mapped_filters) {
                local %_ = %line_hash; # Make %_ available to the callback
                local $_ = $fld == 0 ? $line_ref : $line[$fld - 1]; # Localize $_
                my $keep = $mapped_filters{$fld}->($line_ref, \%line_hash);
                if (!$keep) {
                    $skip = 1;
                    last;
                }

                # If the callback modified $_, write the mutation back to the data
                if ($fld > 0) {
                    $line[$fld - 1] = $_;
                    # Keep the 'NA' convention if the filter blanked the value.
                    $line_hash{$header[$fld - 1]} = (defined($_) && $_ eq '') ? 'NA' : $_;
                }
            }
        }
        return if $skip; # Reject the row if it failed the filter, skipping memory allocation

        # Populate requested data structure
        if ($args{'output.type'} eq 'aoh') {
            push @data, \%line_hash;
        } elsif ($args{'output.type'} eq 'hoa') {
            foreach my $col (@header) {
                push @{ $data{$col} }, $line_hash{$col};
            }
        } elsif ($args{'output.type'} eq 'hoh') {
            # Row key comes from the row.names column; that column itself
            # is not duplicated into the inner hashes.
            my $row_name = $line_hash{$args{'row.names'}};
            foreach my $col (@header) {
                next if $col eq $args{'row.names'};
                $data{$col}{$row_name} = $line_hash{$col};
            }
        }
    });
    # Release closure state eagerly before returning the result.
    @header = ();
    %mapped_filters = ();
    if ($args{'output.type'} eq 'aoh') {
        undef %data; # free the unused accumulator
        my $final_ref = \@data;
        return $final_ref;
    } elsif ($args{'output.type'} =~ m/^(?:hoa|hoh)$/) {
        @data = (); # free the unused accumulator
        return \%data;
    }
}
| 161 | ||||||
| 162 | #sub write_table { | |||||
| 163 | # my $data_ref = (ref($_[0]) eq 'HASH' || ref($_[0]) eq 'ARRAY') ? shift : undef; | |||||
| 164 | # my $file = shift; | |||||
| 165 | # my %args = ( | |||||
| 166 | # sep => ',', | |||||
| 167 | # 'row.names' => 1, | |||||
| 168 | # @_, | |||||
| 169 | # ); | |||||
| 170 | # my $current_sub = (split(/::/,(caller(0))[3]))[-1]; | |||||
| 171 | # $args{data} //= $data_ref; | |||||
| 172 | # my %allowed = map { $_ => 1 } qw(data file row.names sep col.names); | |||||
| 173 | # my @err = grep { !$allowed{$_} } keys %args; | |||||
| 174 | # if (@err > 0) { | |||||
| 175 | # die "$current_sub: Unknown arguments passed: " . join(", ", @err) . "\n"; | |||||
| 176 | # } | |||||
| 177 | # die "$current_sub: 'data' must be a HASH or ARRAY reference\n" | |||||
| 178 | # unless defined $args{data} && (ref($args{data}) eq 'HASH' || ref($args{data}) eq 'ARRAY'); | |||||
| 179 | ||||||
| 180 | # my $col_names = $args{'col.names'}; | |||||
| 181 | # if (defined $col_names && ref($col_names) ne 'ARRAY') { | |||||
| 182 | # die "$current_sub: 'col.names' must be an ARRAY reference\n"; | |||||
| 183 | # } | |||||
| 184 | # my $data = $args{data}; | |||||
| 185 | # my $sep = $args{sep}; | |||||
| 186 | # my $inc_rownames = $args{'row.names'}; | |||||
| 187 | # my $quote_field = sub { | |||||
| 188 | # my ($val, $sep) = @_; | |||||
| 189 | # return '' unless defined $val; | |||||
| 190 | # die "$current_sub: Cannot write nested reference types to table\n" if ref($val); | |||||
| 191 | # | |||||
| 192 | # my $str = "$val"; | |||||
| 193 | # if (index($str, $sep) != -1 || index($str, '"') != -1 || $str =~ /[\r\n]/) { | |||||
| 194 | # $str =~ s/"/""/g; | |||||
| 195 | # $str = qq{"$str"}; | |||||
| 196 | # } | |||||
| 197 | # return $str; | |||||
| 198 | # }; | |||||
| 199 | # my $data_type = ref $data; | |||||
| 200 | # my ($is_hoh, $is_hoa, $is_aoh) = (0, 0, 0); | |||||
| 201 | # my @rows; | |||||
| 202 | # if ($data_type eq 'HASH') { | |||||
| 203 | # @rows = keys %$data; | |||||
| 204 | # return if @rows == 0; | |||||
| 205 | # my $first_type = ref $data->{$rows[0]}; | |||||
| 206 | # die "$current_sub: Data values must be either all HASHes or all ARRAYs\n" | |||||
| 207 | # unless $first_type eq 'HASH' || $first_type eq 'ARRAY'; | |||||
| 208 | ||||||
| 209 | # my @type_err = grep { ref $data->{$_} ne $first_type } @rows; | |||||
| 210 | # if (@type_err > 0) { | |||||
| 211 | # die "$current_sub: Mixed data types detected. Ensure all values are $first_type references.\n"; | |||||
| 212 | # } | |||||
| 213 | # $is_hoh = ($first_type eq 'HASH'); | |||||
| 214 | # $is_hoa = ($first_type eq 'ARRAY'); | |||||
| 215 | # } else { | |||||
| 216 | # return if @$data == 0; | |||||
| 217 | ||||||
| 218 | # my $first_elem = $data->[0]; | |||||
| 219 | # die "$current_sub: For ARRAY data, all elements must be HASH references (Array of Hashes)\n" | |||||
| 220 | # unless defined $first_elem && ref($first_elem) eq 'HASH'; | |||||
| 221 | ||||||
| 222 | # my @type_err = grep { !defined($_) || ref($_) ne 'HASH' } @$data; | |||||
| 223 | # if (@type_err > 0) { | |||||
| 224 | # die "$current_sub: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n"; | |||||
| 225 | # } | |||||
| 226 | # $is_aoh = 1; | |||||
| 227 | # } | |||||
| 228 | # open my $fh, '>', $file or die "$current_sub: Could not open '$file' for writing: $!\n"; | |||||
| 229 | # if ($is_hoh) { | |||||
| 230 | # my @headers; | |||||
| 231 | # if (defined $col_names) { # Bug 6 fix: was "if ($col_names)" â use defined | |||||
| 232 | # @headers = @$col_names; | |||||
| 233 | # } else { | |||||
| 234 | # my %col_map; | |||||
| 235 | # for my $r (@rows) { | |||||
| 236 | # $col_map{$_} = 1 for keys %{ $data->{$r} }; | |||||
| 237 | # } | |||||
| 238 | # @headers = sort keys %col_map; | |||||
| 239 | # } | |||||
| 240 | ||||||
| 241 | # my @header_row = @headers; | |||||
| 242 | # unshift @header_row, '' if $inc_rownames; | |||||
| 243 | # @header_row = map { $quote_field->($_, $sep) } @header_row; | |||||
| 244 | # print $fh join($sep, @header_row) . "\n"; | |||||
| 245 | ||||||
| 246 | # for my $r (sort @rows) { | |||||
| 247 | # my @row_data = map { defined $data->{$r}{$_} ? $data->{$r}{$_} : "NA" } @headers; | |||||
| 248 | # unshift @row_data, $r if $inc_rownames; | |||||
| 249 | # my @quoted = map { $quote_field->($_, $sep) } @row_data; | |||||
| 250 | # print $fh join($sep, @quoted) . "\n"; | |||||
| 251 | # } | |||||
| 252 | # } elsif ($is_hoa) { | |||||
| 253 | # # 1. Find the maximum number of rows | |||||
| 254 | # my $max_rows = 0; | |||||
| 255 | # foreach my $col (keys %$data) { | |||||
| 256 | # my $len = scalar @{ $data->{$col} }; | |||||
| 257 | # $max_rows = $len if $len > $max_rows; | |||||
| 258 | # } | |||||
| 259 | # $max_rows--; # Convert length to max index | |||||
| 260 | ||||||
| 261 | # # 2. Determine headers | |||||
| 262 | # my @headers; | |||||
| 263 | # if (defined $col_names) { | |||||
| 264 | # @headers = @$col_names; | |||||
| 265 | # } else { | |||||
| 266 | # @headers = sort keys %$data; | |||||
| 267 | # } | |||||
| 268 | # | |||||
| 269 | # die "Could not get headers in $current_sub" if @headers == 0; | |||||
| 270 | ||||||
| 271 | # # Bug 5 fix: if row.names is a column name (non-numeric string), pull it out to be first | |||||
| 272 | # my $rownames_col; | |||||
| 273 | # if ($inc_rownames && $inc_rownames =~ /\D/) { | |||||
| 274 | # $rownames_col = $inc_rownames; | |||||
| 275 | # @headers = grep { $_ ne $rownames_col } @headers; | |||||
| 276 | # } | |||||
| 277 | ||||||
| 278 | # # 3. Print Header Row | |||||
| 279 | # my @header_row = @headers; | |||||
| 280 | # unshift @header_row, '' if $inc_rownames; | |||||
| 281 | # @header_row = map { $quote_field->($_, $sep) } @header_row; | |||||
| 282 | # print $fh join($sep, @header_row) . "\n"; | |||||
| 283 | ||||||
| 284 | # # 4. Process and Print Data Rows | |||||
| 285 | # for my $i (0 .. $max_rows) { | |||||
| 286 | # my @row_data; | |||||
| 287 | # | |||||
| 288 | # foreach my $col (@headers) { | |||||
| 289 | # push @row_data, defined($data->{$col}[$i]) ? $data->{$col}[$i] : 'NA'; | |||||
| 290 | # } | |||||
| 291 | # | |||||
| 292 | # if ($inc_rownames) { | |||||
| 293 | # # Bug 5 fix: use named column value if row.names is a column name | |||||
| 294 | # my $rn_val = defined $rownames_col | |||||
| 295 | # ? (defined $data->{$rownames_col}[$i] ? $data->{$rownames_col}[$i] : 'NA') | |||||
| 296 | # : $i + 1; | |||||
| 297 | # unshift @row_data, $rn_val; | |||||
| 298 | # } | |||||
| 299 | # | |||||
| 300 | # my @quoted = map { $quote_field->($_, $sep) } @row_data; | |||||
| 301 | # print $fh join($sep, @quoted) . "\n"; | |||||
| 302 | # } | |||||
| 303 | # } elsif ($is_aoh) { | |||||
| 304 | # my @headers; | |||||
| 305 | # if (defined $col_names) { # Bug 6 fix: was "if ($col_names)" â use defined | |||||
| 306 | # @headers = @$col_names; | |||||
| 307 | # } else { | |||||
| 308 | # my %col_map; | |||||
| 309 | # for my $row_hash (@$data) { | |||||
| 310 | # $col_map{$_} = 1 for keys %$row_hash; | |||||
| 311 | # } | |||||
| 312 | # @headers = sort keys %col_map; | |||||
| 313 | # } | |||||
| 314 | ||||||
| 315 | # # Bug 5 fix: if row.names is a column name (non-numeric string), pull it out to be first | |||||
| 316 | # my $rownames_col; | |||||
| 317 | # if ($inc_rownames && $inc_rownames =~ /\D/) { | |||||
| 318 | # $rownames_col = $inc_rownames; | |||||
| 319 | # @headers = grep { $_ ne $rownames_col } @headers; | |||||
| 320 | # } | |||||
| 321 | ||||||
| 322 | # my @header_row = @headers; | |||||
| 323 | # unshift @header_row, "" if $inc_rownames; | |||||
| 324 | # @header_row = map { $quote_field->($_, $sep) } @header_row; | |||||
| 325 | # print $fh join($sep, @header_row) . "\n"; # Bug 7 fix: was "say $fh" â use print consistently | |||||
| 326 | # for my $i (0 .. $#$data) { | |||||
| 327 | # my $row_hash = $data->[$i]; | |||||
| 328 | # my @row_data = map { defined $row_hash->{$_} ? $row_hash->{$_} : "NA" } @headers; | |||||
| 329 | # if ($inc_rownames) { | |||||
| 330 | # # Bug 5 fix: use named column value if row.names is a column name | |||||
| 331 | # my $rn_val = defined $rownames_col | |||||
| 332 | # ? (defined $row_hash->{$rownames_col} ? $row_hash->{$rownames_col} : 'NA') | |||||
| 333 | # : $i + 1; | |||||
| 334 | # unshift @row_data, $rn_val; | |||||
| 335 | # } | |||||
| 336 | # my @quoted = map { $quote_field->($_, $sep) } @row_data; | |||||
| 337 | # print $fh join($sep, @quoted) . "\n"; | |||||
| 338 | # } | |||||
| 339 | # } | |||||
| 340 | # close $fh; # Bug 8 fix: file handle was never closed | |||||
| 341 | #} | |||||
| 342 | 1; | |||||
| 343 | =encoding utf8 | |||||
| 344 | ||||||
| 345 - 848 | =head1 Synopsis
Get basic R statistical functions working in Perl as if they were part of List::Util, like C<min>, C<max>, C<sum>, etc.
I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.
There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.
This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.
There B<are> other modules on CPAN that can do B<PARTS> of this, but this works the way that I B<want> it to.
=head1 Functions/Subroutines
=head2 aov
aov(
{
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
ctrl => [1, 1, 1, 0, 0, 0]
},
'yield ~ ctrl');
which returns
{
ctrl {
Df 1,
"F value" 25.6000000000001,
"Mean Sq" 1.70666666666667,
Pr(>F) 0.00718232855871859,
"Sum Sq" 1.70666666666667
},
Residuals {
Df 4,
"Mean Sq" 0.0666666666666665,
"Sum Sq" 0.266666666666666
}
}
You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:
my $res_2way = aov($data_2way, 'len ~ supp * dose');
It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.
=head2 chisq_test
my @test_data = ([762, 327, 468], [484, 239, 477]);
my $test_data = chisq_test(\@test_data);
which outputs:
{
data.name "Perl ArrayRef",
expected [
[0] [
[0] 703.671381936888,
[1] 319.645266594124,
[2] 533.683351468988
],
[1] [
[0] 542.328618063112,
[1] 246.354733405876,
[2] 411.316648531012
]
],
method "Pearson's Chi-squared test",
observed [
[0] [
[0] 762,
[1] 327,
[2] 468
],
[1] [
[0] 484,
[1] 239,
[2] 477
]
],
p.value 2.95358918321176e-07,
parameter {
df 2
},
statistic {
X-squared 30.0701490957547
}
}
It also supports 1D arrays for Goodness of Fit tests:
my $chisq_1d = chisq_test([10, 20, 30]);
For 2x2 matrices, Yates' Continuity Correction is applied automatically, exactly like in R.
=head2 cor
cor($array1, $array2, $method = 'pearson'),
that is, C<pearson> is the default and will be used if C<$method> is not specified.
Just like R, C<pearson>, C<spearman>, and C<kendall> are available
If you provide an array of arrays (a matrix), C<cor> will compute the correlation matrix automatically.
=head2 cor_test
my $result = cor_test(
'x' => $x,
'y' => $y,
alternative => 'two.sided',
method => 'pearson',
continuity => 1
);
C<cor_test> safely handles C<undef> (or C<NA>) values seamlessly by computing over pairwise complete observations.
=head2 cov
cov($array1, $array2, 'pearson')
or
cov($array1, $array2, 'spearman')
or
cov($array1, $array2, 'kendall')
=head2 fisher_test
=head3 array reference entry
my $array_data = [
[10, 2],
[3, 15]
];
my $res1 = fisher_test($array_data);
which returns a hash reference:
{
alternative "two.sided",
conf_int [
[0] 2.75338278824932,
[1] 301.462337971516
],
estimate {
"odds ratio" 21.3053175567504
},
method "Fisher's Exact Test for Count Data",
p_value 0.00053672411914343
}
=head3 hash reference entry
$ft = fisher_test( {
Guess => {
Milk => 3, Tea => 1
},
Truth => {
Milk => 1, Tea => 3
}
});
I have the p-value calculated very precisely, but there is some inexactness (approximately 1% for the confidence intervals) which I couldn't rectify. Apart from the p-value, where the results are identical to R's, the answers are very close.
=head2 glm
takes a hash of an array as input
my %tooth_growth = (
dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
2.0 2.0 2.0)],
len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5
17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0
25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],
supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]
);
my $glm_teeth = glm(
data => \%tooth_growth,
formula => 'len ~ dose + supp',
family => 'gaussian'
);
I'm not completely confident that this is working perfectly, though I've gotten this subroutine to work for simple cases.
In addition to the C<gaussian> default, it fully supports logistic regression using the C<binomial> family parameter via Iteratively Reweighted Least Squares (IRLS):
my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');
=head2 hist
Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.
my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);
If C<breaks> is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.
=head2 kruskal_test
Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)
Performs a Kruskal-Wallis rank sum test, see
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test
=head3 R-like array entry
my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects
my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease
my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis
my @x = (@xk, @yk, @zk);
my @g = (
(map {'Normal subjects'} 0..4),
(map {'Subjects with obstructive airway disease'} 0..3),
map {'Subjects with asbestosis'} 0..4
);
my $t0 = Time::HiRes::time();
my $kt = kruskal_test(\@x, \@g);
my $t1 = Time::HiRes::time();
printf("Kruskal calculation in %g seconds.\n", $t1-$t0);
p $kt;
=head3 hash of array entry
my %x = (
'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],
'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],
'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]
);
$t0 = Time::HiRes::time();
$kt = kruskal_test(\%x);
$t1 = Time::HiRes::time();
printf("Kruskal calculation via HoA in %g seconds.\n", $t1-$t0);
p $kt;
=head2 ks_test
The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are part of the same distribution is implemented simply:
$ks = ks_test(\@x, \@y, alternative => 'greater');
returning a hash reference.
Also, a single array can be tested against a normal distribution:
$ks = ks_test($ksx, 'pnorm');
The p-value precision is about 1e-8, which I want to improve, but am not sure how.
=head2 lm
This is the linear models function.
$lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);
where C<$mtcars> is a hash of hashes
C<lm> also supports generating interaction terms directly within the formula using the C<*> operator:
my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);
If your data contains missing numbers (C<NA> or C<undef>), C<lm> handles listwise deletion dynamically to ensure mathematical integrity before fitting.
the dot operator also works:
$lm = lm(formula => 'y ~ .', data => $dot_data);
=head2 matrix
my $mat1 = matrix(
data => [1..6],
nrow => 2
);
You can also pass C<< byrow =E<gt> 1 >> if you want the matrix populated row-wise instead of column-wise.
=head2 mean
mean(1,2,3);
or
my @arr = 1..8;
mean(@arr, 4, 5)
or
mean([1,1], [2,2]) # 1.5
=head2 median
works like mean, taking array references and arrays:
median( $test_data[$i][0] )
=head2 p_adjust
Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"
my @q = p_adjust(\@pvalues, $method);
=head2 power_t_test
$test_data = power_t_test(
n => 30, delta => 0.5,
sd => 1.0, sig_level => 0.05
);
It also allows configuring the test type (C<< type =E<gt> 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E<gt> 'one.sided' >>). You can also pass C<< strict =E<gt> 1 >> to strictly evaluate both tails of the distribution.
=head2 quantile
Calculates sample quantiles using R's continuous Type 7 interpolation.
my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);
If the C<probs> parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C<c(0, .25, .5, .75, 1)>). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).
=head2 rbinom
Create a binomial distribution of numbers
my $binom = rbinom( n => $n, prob => 0.5, size => 9);
It hooks directly into Perl's internal PRNG system, respecting C<srand()> seeds.
=head2 read_table
I've tried to make this as simple as possible, trying to follow from R:
my $test_data = read_table('t/HepatitisCdata.csv');
output types can be AOH (aoh), HOA (hoa), HOH (hoh)
read_table($filename, 'output.type' => 'aoh');
read_table($filename, 'output.type' => 'hoa');
and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:
$test_data = read_table(
't/HepatitisCdata.csv',
filter => {
Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column
},
'output.type' => 'aoh'
);
=head2 rnorm
Make a normal distribution of numbers, with pre-set mean C<mean>, standard deviation C<sd>, and number C<n>.
my ($rmean, $sd, $n) = (10, 2, 9999);
my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);
=head2 runif
=head3 named arguments
Make a distribution of approximately uniform distribution
my $unif = runif( n => $n, min => 0, max => 1);
where C<n> is the number of items, the values are between C<min> and C<max>
=head3 positional args
this is to match R's behavior:
runif( 9 )
will make 9 numbers in [0,1]
runif(9, 0, 99)
will match C<n>, C<min>, and C<max> respectively
=head2 scale
my @scaled_results = scale(1..5);
You can also pass an options hash to disable centering or scaling:
my @scaled_results = scale(1..5, { center => false, scale => true });
It fully supports matrix operations. By passing an array of arrays, C<scale> processes the data column by column independently:
my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);
=head2 sd
my $stdev = sd(2,4,4,4,5,5,7,9);
Correct answer is 2.1380899352994;
=head2 seq
Works as closely as I can to R's seq, which is very similar to Perl's C<for> loops. Returns an array, not an array reference.
=head3 Example 1: Standard integer sequence
say 'seq(1, 5):';
my @seq = seq(1, 5);
say join(', ', @seq), "\n";
say 'seq(1, 2, 0.25):';
@seq = seq(1, 2, 0.25);
=head3 Example 2: Fractional steps
say 'seq(1, 2, 0.25):';
@seq = seq(1, 2, 0.25);
say join(", ", @seq), "\n";
for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop
is_approx(pop @seq, $idx, "seq item $idx with fractional step");
}
=head3 Example 3: Negative steps
say 'seq(10, 5, -1):';
@seq = seq(10, 5, -1);
say join(", ", @seq), "\n";
for (my $idx = 5; $idx <= 10; $idx++) { # count up to pop, since seq(10, 5, -1) descends
is_approx(pop @seq, $idx, "seq item $idx with negative step");
}
=head2 shapiro_test
tests to see if an array reference is normally distributed, returns a p-value and a statistic
my $shapiro = shapiro_test(
[1..5]
);
and returns the hash reference:
{
p.value 0.589650577093106,
p_value 0.589650577093106,
statistic 0.960870680168535,
W 0.960870680168535
}
=head2 t_test
There are 1-sample and 2-sample t-tests:
my $t_test = t_test( $test_data[$i][$j], mu => mean( $test_data[$i][$j] ));
or 2-sample:
$t_test = t_test(
$test_data[3][0],
$test_data[3][1],
paired => true
);
returns a hash reference, which looks like:
conf_int => [
-0.06672889, 0.25672889
],
df => 5,
estimate => 0.095,
p_value => 0.19143688433660,
statistic => 1.50996688705414
the two groups compared can be specified, though not necessarily, as C<x> and C<y>, just like in R:
$t_test = t_test(
'x' => $test_data[3][0],
'y' => $test_data[3][1],
paired => true
);
=head2 var
as simple as possible:
var(2, 4, 5, 8, 9)
=head2 wilcox_test
$test_data = wilcox_test(
[1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],
[0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]
);
It fully supports paired tests (C<< paired =E<gt> true >>) and can calculate exact p-values (the default for $N < 50$ without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.
=head2 write_table
mimics R's "write.table", with data as first argument to subroutine, and output file as second
write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => true);
You can also precisely filter and reorder which columns are written by passing an array reference to C<col.names>:
write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']); | |||||