"""Kernel density estimation via diffusion for 1-dimensional data."""######################################### Dependencies #########################################fromnumpyimportarray,arangefromnumpyimportexp,sqrt,piasπfromnumpyimportceil,log2fromnumpyimportprodasproductfromnumpyimporthistogramfromscipy.fftimportdct,idctfromscipy.optimizeimportbrentq######################################### Main #########################################
[docs]defkde1d(x,n=1024,limits=None):""" Estimates the 1d density from discrete observations. The input is a list/array `x` of numbers that represent discrete observations of a random variable. They are binned on a grid of `n` points within the data `limits`, if specified, or within the limits given by the values' range. `n` will be coerced to the next highest power of two if it isn't one to begin with. The limits may be given as a tuple (`xmin`, `xmax`) or a single number denoting the upper bound of a range centered at zero. If any of those values are `None`, they will be inferred from the data. After binning, the function determines the optimal bandwidth according to the diffusion-based method. It then smooths the binned data over the grid using a Gaussian kernel with a standard deviation corresponding to that bandwidth. Returns the estimated `density` and the `grid` upon which it was computed, as well as the optimal `bandwidth` value the algorithm determined. Raises `ValueError` if the algorithm did not converge. """# Convert to array in case a list is passed in.x=array(x)# Round up number of bins to next power of two.n=int(2**ceil(log2(n)))# Determine missing data limits.iflimitsisNone:xmin=xmax=Noneelifisinstance(limits,tuple):(xmin,xmax)=limitselse:xmin=-limitsxmax=+limitsifNonein(xmin,xmax):delta=x.max()-x.min()ifxminisNone:xmin=x.min()-delta/10ifxmaxisNone:xmax=x.max()+delta/10# Determine data range, required for scaling.Δx=xmax-xmin# Determine number of data points.N=len(x)# Bin samples on regular grid.(binned,edges)=histogram(x,bins=n,range=(xmin,xmax))grid=edges[:-1]# Compute 2d discrete cosine transform, then adjust first component.transformed=dct(binned/N)transformed[0]/=2# Pre-compute squared indices and transform components before solver loop.k=arange(n,dtype='float')# "float" avoids integer overflow.k2=k**2a2=(transformed/2)**2# Define internal function to be solved iteratively.defξγ(t,l=7):"""Returns ξ γ^[l] as a function of diffusion time t."""f=2*π**(2*l)*sum(k2**l*a2*exp(-π**2*k2*t))forsinrange(l-1,1,-1):K=product(range(1,2*s,2))/sqrt(2*π)C=(1+(1/2)**(s+1/2))/3t=(2*C*K/N/f)**(2/(3+2*s))f=2*π**(2*s)*sum(k2**s*a2*exp(-π**2*k2*t))return(2*N*sqrt(π)*f)**(-2/5)# Solve for optimal diffusion time t*.try:ts=brentq(lambdat:t-ξγ(t),0,0.1)exceptValueError:raiseValueError('Bandwidth optimization did not converge.')fromNone# Apply Gaussian filter with optimized kernel.smoothed=transformed*exp(-π**2*ts/2*k**2)# Reverse transformation after adjusting first component.smoothed[0]*=2inverse=idct(smoothed)# Normalize density.density=inverse*n/Δx# Determine bandwidth from diffusion time.bandwidth=sqrt(ts)*Δx# Return results.return(density,grid,bandwidth)