Brianboonstra
This C code implements some good and bad ways of calculating correlation between two series, demonstrating the errors arising from less stable implementations.
#include <math.h>
#include <stdio.h>
double testx[]={
9000001.1555276505345677,
9000001.068721046709149,
9000000.96518393946084069,
9000001.1531960437260851,
9000000.88453293874926076,
9000000.67879633024245611,
9000000.14404772017576237,
9000000.18702772133503412,
9000000.21783831471783008,
9000000.23032683921940947,
9000000.23865790920255828,
9000000.34601225320692602,
9000000.19733610902394347,
9000000.23696940408215056,
9000000.25111312020814941,
9000000.24417247897843469,
9000000.26694520385052928,
9000000.34899657977662069};
double testy[]={9000000.31996861636309032,
9000000.34890433638288781,
9000000.37845911905246998,
9000000.415291278041722,
9000000.37069724208731397,
9000000.37429838187159403,
9000000.47751648326269014,
9000000.49707750364699393,
9000000.51272613811181433,
9000000.49233095309358993,
9000000.45823600496928935,
9000000.59001439623101459,
9000000.88994026880913846,
9000001.0479068232871365,
9000001.0424618549497753,
9000000.97737636575118603,
9000000.77763295952982336,
9000000.64632136445519006};
double unstablecorrel1(double x[], double y[], int n) /* Classical one-pass */
{
int i;
double prod=0., sumx=0., sumy=0., sumxSq=0., sumySq=0.;
double r;
for (i=0;i<n;++i) {
prod += x[i]*y[i];
sumx += x[i];
sumy += y[i];
sumxSq += x[i]*x[i];
sumySq += y[i]*y[i];
}
r = (n*prod-sumx*sumy)/(sqrt(n*sumxSq-sumx*sumx)*sqrt(n*sumySq-sumy*sumy));
return r;
}
double unstablecorrel2(double x[], double y[], int n) /*Admdikramr version */
{
int i;
double prod=0., sumx=0., sumy=0., sumxSq=0., sumySq=0.;
double r;
for (i=0;i<n;++i) {
prod += x[i]*y[i];
sumx += x[i];
sumy += y[i];
sumxSq += x[i]*x[i];
sumySq += y[i]*y[i];
}
r = (prod/n-sumx/n*sumy/n)/(sqrt(sumxSq/n-(sumx/n)*(sumx/n))*sqrt(sumySq/n-(sumy/n)*(sumy/n)));
return r;
}
float floatgoodcorrel(float x[], float y[], int n)
{
float pop_sd_x, pop_sd_y, cov_x_y, correlation, sweep,delta_x, delta_y;
float sum_sq_x = 0;
float sum_sq_y = 0;
float sum_coproduct = 0;
float mean_x = x[0];
float mean_y = y[0];
int i;
for (i=2;i<=n;++i) {
sweep = (i - 1.0) / i;
delta_x = x[i-1] - mean_x;
delta_y = y[i-1] - mean_y;
sum_sq_x += delta_x * delta_x * sweep;
sum_sq_y += delta_y * delta_y * sweep;
sum_coproduct += delta_x * delta_y * sweep;
mean_x += delta_x / i;
mean_y += delta_y / i ;
}
pop_sd_x = sqrt( sum_sq_x / n );
pop_sd_y = sqrt( sum_sq_y / n );
cov_x_y = sum_coproduct / n;
correlation = cov_x_y / (pop_sd_x * pop_sd_y);
return correlation;
}
double goodcorrel(double x[], double y[], int n)
{
double pop_sd_x, pop_sd_y, cov_x_y, correlation, sweep,delta_x, delta_y;
double sum_sq_x = 0;
double sum_sq_y = 0;
double sum_coproduct = 0;
double mean_x = x[0];
double mean_y = y[0];
int i;
for (i=2;i<=n;++i) {
sweep = (i - 1.0) / i;
delta_x = x[i-1] - mean_x;
delta_y = y[i-1] - mean_y;
sum_sq_x += delta_x * delta_x * sweep;
sum_sq_y += delta_y * delta_y * sweep;
sum_coproduct += delta_x * delta_y * sweep;
mean_x += delta_x / i;
mean_y += delta_y / i ;
}
pop_sd_x = sqrt( sum_sq_x / n );
pop_sd_y = sqrt( sum_sq_y / n );
cov_x_y = sum_coproduct / n;
correlation = cov_x_y / (pop_sd_x * pop_sd_y);
return correlation;
}
int main()
{
int i,n=18;
float fx[18],fy[18];
double dx[18],dy[18];
printf("Easy case\n");
for (i=0;i<n;++i) {
dx[i]=testx[i]-9000000.0;
dy[i]=testy[i]-9000000.0;
fx[i]=testx[i]-9000000.0;
fy[i]=testy[i]-9000000.0;
}
printf("%.16f\n", floatgoodcorrel(fx,fy,n));
printf("%.16f\n", unstablecorrel1(dx,dy,n));
printf("%.16f\n", unstablecorrel2(dx,dy,n));
printf("%.16f\n", goodcorrel(dx,dy,n));
printf("Difficult case (shifted means)\n");
for (i=0;i<n;++i) {
fx[i]=testx[i];
fy[i]=testy[i];
}
printf("%.16f\n", floatgoodcorrel(fx,fy,n));
printf("%.16f\n", unstablecorrel1(testx,testy,n));
printf("%.16f\n", unstablecorrel2(testx,testy,n));
printf("%.16f\n", goodcorrel(testx,testy,n));
}
Watch it happen
editMight I suggest an extension? Suppose X is a set of test values, such as X = {3,1,4,1,5,9,2} or similar - actually, {1,2,3,4,5,6} has merit, as sums, sums of squares, etc. can be calculated mathematically, with ease. For test purposes, rather than trying just (9000000 + X) as above, run the algorithm with (X), then (10 + X), then (100 + X), then (1000 + X), etc. and watch the various calculations fall apart. Try also in single and double and quadruple precision. NickyMcLean (talk) 04:04, 6 April 2009 (UTC)
algorithm -- how did you come to it?
editHi, I believe that you're the author of the algorithm for one pass linear correaltion coefficient? At least you contributed it here? I beleive you referenced "Elements of Statistical Computing: Numerical computation," by Ronald Aaron Thisted, the pages about sweep operator, which I've found (partially) in Google books, still what's there is a lot of theory with matrices I don't even see how that maps to the problem in question (which has two vectors as input). So as far as I understand your algorithm is still something untrivial: for example "Numerical Recepies" book does 2 passes. They keep stability by using differencies but they didn't reach elegancy you showed. Is there a chance that you write somewhere else (a you'd have problems adding it to the wikipedia article directly) in more details about that particular algorithm? Thank you. Janjs (talk) 17:43, 17 June 2009 (UTC)
- I'll try to write something up. For now, I should just clarify the translation of all that matrix algebra to this algorithm. Computing correlation is a degenerate case of the use of the sweep operator. So if you implement something like the formulas found in Thisted (many books have equivalents) and then eliminate all the unused bits, this is what you end up with.Brianboonstra (talk) 13:47, 18 August 2009 (UTC)
Unblock request
editBrianboonstra (block log • active blocks • global blocks • contribs • deleted contribs • filter log • creation log • change block settings • unblock • checkuser (log))
Request reason:
Caught by a colocation web host block but this host or IP is not a web host. Behind corporate firewall. Brianboonstra (talk) 13:32, 17 April 2018 (UTC)
Decline reason:
Procedural decline only. You forgot to tell us your IP address so we can't investigate your claim. You can find this using WhatIsMyIP. If you don't wish to provide this publicly, you may use WP:UTRS to provide the IP address privately. Yamla (talk) 14:15, 17 April 2018 (UTC)
If you want to make any further unblock requests, please read the guide to appealing blocks first, then use the {{unblock}} template again. If you make too many unconvincing or disruptive unblock requests, you may be prevented from editing this page until your block has expired. Do not remove this unblock review while you are blocked.
The IP is from Forcepoint Cloud Services and is hardblocked.
— Berean Hunter (talk) 15:29, 18 April 2018 (UTC)
Hi. We're into the last five days of the Women in Red World Contest. There's a new bonus prize of $200 worth of books of your choice to win for creating the most new women biographies between 0:00 on the 26th and 23:59 on 30th November. If you've been contributing to the contest, thank you for your support, we've produced over 2000 articles. If you haven't contributed yet, we would appreciate you taking the time to add entries to our articles achievements list by the end of the month. Thank you, and if participating, good luck with the finale!