;---------------------------------------------------------------------- ;this subroutine needed for solving for c50 (inner quartile) ;---------------------------------------------------------------------- function newton_c50, c50 ;value of p is transferred via common common newton_c50_common,pcom p=pcom ;the following two are equivalent: ;using IDL cumulative function of normalized gaussian func=(1.-p)*(gauss_pdf(c50)-gauss_pdf(-c50))+$ p*(gauss_pdf(c50/3.)-gauss_pdf(-c50/3.))$ -0.5 ;error-function func=(1-p)*erf(c50/sqrt(2.))+$ p*erf(c50/sqrt(2.)/3.)$ -0.5 return,func end ;---------------------------------------------------------------------- ;main ;---------------------------------------------------------------------- common newton_c50_common,pcom program='exe2_gaussian' ps=.1 ;Illustrate the effect of outliers, in terms of a superposition of ;two Gaussians, with standard deviations sigma=1 and sigma=3 ;(variance sigma^2=1 and 9) ;Weight of the sigma=3 distribution (=outliers) given by paramater p ; g1=1./sqrt(2*!pi)*exp(-0.5*x^2) ; g2=1./sqrt(2*!pi)/3.*exp(-0.5*x^2/3.^2) ; g=(1-p)*g1 + p*g2 ;----------------------------------------------------------- ;create distributions with random numbers ;----------------------------------------------------------- psdirect,program+'_a',ps,/color !p.multi=[0,1,2] nwin ;-------------------------- ;first for p=0.05 N=1000000l ; number of points p=0.05 N1=(1.-p)*N N2=p*N X1=[randomn(seed,N1),3*randomn(seed,N2)] ;plot results with histo_f procedure: ; x1=minimum of tabulation ; x2=maximum of tabulation ; dx=binsize of tabulation ; xbin = centers of bins ; ybin = number of points in the bin ; /noscale keyword -> ; histo_f returns yt =ybin ; probability density pdf = yt/(dx*N) xmin=-10. xmax= 10. dx=0.05 histo_f,X1,xmin,xmax,dx,xt1,yt1,/noscale pdf1=1.*yt1/(n_elements(x1)*dx) plot,xt1,pdf1,psym=10,xtitle='x',ytitle='pdf',$ title='Two gaussians: effect of outliers' ;------------------------------------------------ ;similarly for p=0.1 and p=0.2 p=0.10 N1=(1.-p)*N N2=p*N X2=[randomn(seed,N1),3*randomn(seed,N2)] histo_f,X2,xmin,xmax,dx,xt2,yt2,/noscale pdf2=1.*yt2/(n_elements(x2)*dx) oplot,xt2,pdf2,col=2,lines=2 p=0.20 N1=(1.-p)*N N2=p*N X3=[randomn(seed,N1),3*randomn(seed,N2)] histo_f,X3,xmin,xmax,dx,xt3,yt3,/noscale pdf3=1.*yt3/(n_elements(x3)*dx) oplot,xt3,pdf3,col=3,lines=1 xyouts,0.15,.9,'pdf=(1-p)*Gauss(0,1)+p*Gauss(0,3^2)',/normal label_data,0.67,.9,['p=0.05','p=0.1','p=0.2'],lines=[0,2,1],col=[1,2,3] ;------------------------------------------------------------- ;plot the same in logarithmic scale ;show also the theoretical functions nwin plot,xt1,pdf1,psym=10,/ylog,yr=[1d-5,1] oplot,xt2,pdf2,psym=10,col=2,lines=2 oplot,xt3,pdf3,psym=10,col=3,lines=1 ;theoretical for p=0.05 xarg=xt1 p=0.05 g1=1./sqrt(2*!pi)*exp(-0.5*xarg^2) g2=1./sqrt(2*!pi)/3.*exp(-0.5*xarg^2/3.^2) oplot,xarg,(1.-p)*g1 oplot,xarg,p*g2 oplot,xarg,(1.-p)*g1+p*g2,lines=0 !p.multi=0 psdirect,program+'_a',ps,/stop ;------------------------------------------------ ;compare RMS, mean deviation, and inner quartile width ;as a measure of the width of the distribution ;------------------------------------------------ psdirect,program+'_b',ps ;------------ ;Analytically ;------------ ;for values of p from 0 to 0.3 parg=findgen(31)*.01 ;root-mean-square: integral of x^2g(x) RMS=sqrt((1.-parg)+parg*3.^2) ;mean (absolute) deviation: integral of abs(x)g(x) MAD=2./sqrt(2.*!pi)*((1.-parg)*1. +parg/3.*9.) ;interquartile width c50: integral form -c50 to c50 g(x) equals 0.5 ;need to solve numerically (G is the cumulative distribution of gaussian) ; (1-p)*(G1(c50)-G1(-c50))+p*(G2(c50)-G2(-c50))=0.5 ;use Newton's method to find the root of ; f(c50;p)=(1-p)*(G1(c50)-G1(-c50))+p*(G2(c50)-G2(-c50))-0.5 ;this function f(c50;p) is returned by ;the function-procedure newton_c50(c50) ; p is passed to the function via a common block c50=parg*0 for i=0,n_elements(parg)-1 do begin pcom=parg(i) ;passed via common to newton_c50 p_ini=0. ;initial guess for iteration c50(i)=newton(p_ini,'newton_c50') endfor nwin plot,parg,rms,lines=0,xtitle='p',ytitle='width of the distribution' oplot,parg,mad,lines=2 oplot,parg,c50,lines=1 ;-------------------------------------------- ;numerically from the generated distributions ;------------;------------;------------;----- ;for p=0.05,0.1,0.2 for ip=1,3 do begin if(ip eq 1) then begin p=0.05 pdf=pdf1 x=x1 xt=xt1 endif if(ip eq 2) then begin p=0.1 pdf=pdf2 x=x2 xt=xt2 endif if(ip eq 3) then begin p=0.2 pdf=pdf3 x=x3 xt=xt3 endif ;directly from array of x rms=sqrt(mean(x^2)) mad=mean(abs(x)) x=x(sort(x)) c50=0.5*(x(N*0.75)-x(N*0.25)) plots,[p,rms],psym=6,col=ip plots,[p,mad],psym=6,col=ip plots,[p,c50],psym=6,col=ip ;from the tabulated pdf's rms=sqrt(mean(xt^2*pdf)/mean(pdf)) mad=mean(abs(xt)*pdf)/mean(pdf) sum=pdf*0. for ii=1,n_elements(xt)-1 do begin sum(ii)=sum(ii-1)+pdf(ii)*dx endfor ind=where(sum gt 0.25) ind25=ind(0) ;0.25 attained between xt(ind25) and xt(ind25-1) ;weigted mean c50m=(sum(ind25-1)*xt(ind25)+sum(ind25)*xt(ind25-1))/$ (sum(ind25)+sum(ind25-1))+0.5*dx ind=where(sum gt 0.75) ind75=ind(0) ;0.75 attained between xt(ind75) and xt(ind75-1) ;weigted mean c50p=(sum(ind75-1)*xt(ind75)+sum(ind75)*xt(ind75-1))/$ (sum(ind75)+sum(ind75-1))+0.5*dx print,c50m,c50p c50=(c50p-c50m)/2. plots,[p,rms],psym=2,col=ip plots,[p,mad],psym=2,col=ip plots,[p,c50],psym=2,col=ip endfor label_data,0.1,.9,['RMS','MD','C50'],lines=[0,2,1] psdirect,program+'_b',ps,/stop end