# -*- coding: utf8 -*-
"""Statistical tools"""
# Copyright or © or Copr. Actimar/IFREMER (2010-2015)
#
# This software is a computer program whose purpose is to provide
# utilities for handling oceanographic and atmospheric data,
# with the ultimate goal of validating the MARS model from IFREMER.
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
import numpy as N,numpy.ma as MA,MV2
import cdms2
import cdtime
from genutil.statistics import percentiles
from vacumm import VACUMMError
from vacumm.misc.axes import isaxis, create_time
from vacumm.misc.grid import get_grid, set_grid
from vacumm.misc import MV2_axisConcatenate, set_atts, cp_atts
from vacumm.misc.io import netcdf4
from vacumm.misc.atime import comptime, reltime
__all__ = ['corr_proba', 'ensrank', 'qtmin', 'qtmax', 'qtminmax',
'StatAccum', 'StatAccumError']
__all__.sort()
[docs]def corr_proba(r, ndata, ndataset=2, dof=False):
"""Probability of rejecting correlations
- **r**: Correlation coefficient
- **ndata**: Number of records use for correlations
- **ndataset**, optional: Number of datasets (1 for autocorrelations, else 2) [default: 2]
.. todo::
This must be rewritten using :mod:`scipy.stats`
"""
# TODO: use scipy for betai and _gamma?
from genutil.salstat import betai,_gammaln
# Basic tests
ndata = MA.masked_equal(ndata,0,copy=0)
r = MV2.masked_where(MA.equal(MA.absolute(r),1.),r,copy=0)
# Degree of freedom
if dof:
df = ndata
else:
df = ndata-2-ndataset
# Advanced test: prevent extreme values by locally decreasing the dof
reduc = N.ones(r.shape)
z = None
while z is None or MA.count(MA.masked_greater(z,-600.)):
if z is not None:
imax = MA.argmin(z.ravel())
reduc.flat[imax] += 1
dfr = df/reduc
t = r*MV2.sqrt(dfr/((1.0-r)* (1.0+r)))
a = 0.5*dfr
b = 0.5
x = df/(dfr+t**2)
z = _gammaln(a+b)-_gammaln(a)-_gammaln(b)+a*MA.log(x)+b*MA.log(1.0-x)
# Perfom the test and format the variable
prob = MV2.masked_array(betai(a,b,x),axes=r.getAxisList())*100
prob.id = 'corr_proba' ; prob.name = prob.id
prob.long_name = 'Probability of rejection'
prob.units = '%'
return prob
[docs]def ensrank(obs, ens, gethist=False, getnrz=False, centered=False):
"""Compute the rank of a reference (observations) in an ensemble
If ``nens`` is the size of the ensemble,
the rank may go from ``0`` to ``nens``.
- **obs**, (...): Observation array
- **ens**, (nens,...): Ensemble array
- **getrank**, optional: If True, return also the rank histogram
- **getnrz**, optional: If True, return also non recovery zones
- **centered**, optional: Center the ensemble data on the observations
before computing the rank
"""
# Check
ens = MV2.asarray(ens)
obs = MV2.asarray(obs)
nens = ens.shape[0]
assert nens>3, 'Your ensemble is too small (nens=%i)'%nens
# Inits
rank = obs.clone()
rank.id = 'rank'
rank.long_name = 'Rank'
if hasattr(rank, 'units'): del rank.units
nrz = obs.clone()
nrz.id = 'nrz'
nrz.long_name = 'Non recovery zones'
nrz[:] = MV2.masked
# Sort ensemble at each pixel and compute the min biais
bias = MV2.sort(ens, axis=0)-obs
if centered:
bias -= bias.mean(axis=0)
nrz[:] = MV2.where((bias[-1]<0.).astype('b'), bias[-1], nrz)
nrz[:] = MV2.where((bias[0]>0.).astype('b'), bias[0], nrz)
# Index of member = rank
rank[:] = (bias<0).astype('i').sum(axis=0)
del bias
## Apply observation mask
#nrz[:] = MV2.masked_where(MV2.getmask(obs), nrz, copy=0)
#rank[:] = MV2.masked_where(MV2.getmask(obs), rank, copy=0)
# Rank histogram
if gethist:
if N.ma.isMA(rank): rank = rank.compressed()
hist = N.histogram(rank, N.arange(0,nens+1))
# Out
ret = rank,
if gethist:
ret += hist,
if getnrz:
ret += nrz,
return ret[0] if len(ret)==1 else ret
[docs]def qtminmax(var, qt=5.):
"""Get min and max using quantiles
This is useful for very asymmetic distributions
:Params:
- **var**: A numeric array.
- **qt**, optional: Percentile used to define min and max
- Single number: ``qt`` -> ``[qt, 100-qt]``
- Two elements: used directly.
"""
if N.ma.isMA(var): var = var.compressed()
if not hasattr(qt, '__len__'):
qt = [qt, 100-qt]
qt = N.sort(N.clip(N.asarray(qt), 0, 100.))
vmin, vmax = percentiles(var.ravel(), qt)
return vmin, vmax
[docs]def qtmin(var, qt=5.):
"""Get min max using quantiles
This is useful for very asymmetic distributions
:Params:
- **var**: A numeric array.
- **qt**, optional: Percentile used to define min
"""
if N.ma.isMA(var): var = var.compressed()
qt = N.clip([qt], 0, 100.)
vmin = percentiles(var.ravel(), qt)
return vmin
[docs]def qtmax(var, qt=95.):
"""Get max using quantiles
This is useful for very asymmetic distributions
:Params:
- **var**: A numeric array.
- **qt**, optional: Percentile used to define max
"""
if N.ma.isMA(var): var = var.compressed()
qt = N.clip([qt], 0, 100.)
vmax = percentiles(var.ravel(), qt)
return vmax
[docs]class StatAccumError(VACUMMError):
pass
[docs]class StatAccum(object):
"""Statistics accumulator
:Generic params:
- **t/sall**: Perform all the statistics by default.
- **withtime**, optional: Input does not contain a time dimension.
:Single:
- **tcount/scount**: Number of observations taken into account
- **tavail/savail**: Percentage of available observations
- **tmean/smean**: Temporal (t) / Spatial (s) average
- **tstd/sstd**: Temporal (t) / Spatial (s) std
:Dual:
- **tall**: Perform all the following statistics by default.
- **tbias/sbias**: Temporal (t) / Spatial (s) bias
- **trms/srms**: Temporal (t) / Spatial (s) RMS
- **tcrms/scrms**: Temporal (t) / Spatial (s) centered RMS
- **tcorr/scorr**: Temporal (t) / Spatial (s) Correlation
:Example:
>>> sa = StatAccum(tbias=True, srms=True)
>>> sa += ssta1, sstb1
>>> sa += ssta2, sstb2
>>> tbias = sa.get_tbias()
>>> srms = sa.get_srms()
"""
single_stats = 'mean', 'std', 'hist', 'min', 'max'
dual_stats = 'bias', 'rms', 'crms', 'corr', 'avail', 'count'
all_stats = single_stats + dual_stats
_single_accums = 'sum', 'sqr', 'min', 'max', 'hist'
_dual_accums = 'prod',
default_restart_file = 'stataccum_restart.nc'
def __init__(self,
tall=False, tavail=None, tmean=None, tstd=None, smax=None, smin=None,
tbias=None, trms=None, tcrms=None, tcorr=None, shist=None, tcount=None,
sall=False, savail=None, smean=None, sstd=None, tmax=None, tmin=None,
sbias=None, srms=None, scrms=None, scorr=None, thist=None, scount=None,
bins=None, restart_file=None, restart=False,
withtime=None):
# Restart?
if restart:
if restart_file is None and isinstance(restart, basestring):
restart_file = restart
self.restart_file = restart_file
self.load()
else:
for st in 'st':
activate = False
for stype in self.all_stats:
value = eval(st+stype)
if value is None: value = eval(st+'all')
setattr(self, st+stype, value)
activate |= value
setattr(self, st+'stats', activate)
self.iterindex = 0
self.lasttime = None
self.withtime = withtime
if self.thist or self.shist:
if bins is None:
if (self.thist and not self.tall) or (self.shist and not self.sall):
raise StatAccumError(
"You must set the 'bins' keyword to make histograms")
if self.thist and self.tall:
self.thist = False
if self.shist and self.sall:
self.shist = False
self._baxis = cdms2.createAxis(0.5*(bins[:-1]+bins[1:]), id='hbin')
self._baxis.long_name = 'Histogram bin centers'
self._baxis.setBounds(bins)
else:
self._baxis = None
self.bins = bins
self.nitems = 0
self.dual = None
self.restart_file = None
self.ns = self.nt = -1
def __len__(self):
return self.iterindex
[docs] def append(self, *items):
"""Append data to the accumulator"""
# Initialization
if self.iterindex==0:
# Two variables?
self.nitems = len(items)
self.dual = self.nitems>1
# Do we have time in input variables
if self.withtime is None:
self.withtime = items[0].getTime() is not None
# Flags
if not self.dual:
self.tbias = self.trms = self.tcrms = self.tcorr = False
self.sbias = self.srms = self.scrms = self.scorr = False
self.tstats = (self.tmean or self.tstd or self.tbias or self.trms or
self.tcrms or self.tavail or self.tmin or self.tmax or self.thist or
self.tcount)
self.sstats = (self.smean or self.sstd or self.sbias or self.srms or
self.scrms or self.savail or self.smin or self.smax or self.shist or
self.scount)
self.tsum = self.tmean or self.tbias or self.tstd or self.tcrms or self.tcorr
self.tsqr = self.tstd or self.trms or self.tcrms or self.tcorr
self.tprod = self.trms or self.tcrms or self.tcorr
self.ssum = self.smean or self.sbias or self.sstd or self.scrms or self.scorr
self.ssqr = self.sstd or self.srms or self.scrms or self.scorr
self.sprod = self.srms or self.scrms or self.scorr
if self.shist or self.thist:
self.nbins = self.bins.shape[0]-1
# Check spatial statistics
if self.sstats and items[0].ndim<2:
self.sstats = self.smean = self.sstd = self.sbias = self.srms = \
self.scrms = self.scorr = self.savail = self.smin = self.smax = \
self.shist = self.scount = False
# Init template for output temporal statistics
if self.tstats:
self._ttemplates = ()
self._thtemplates = None
if self.thist:
self._thtemplates = ()
selspace = 0 if self.withtime else slice(None)
for item in items:
tpl = item[selspace].clone()
for att in tpl.attributes:
delattr(tpl, att)
self._ttemplates += tpl,
if self.thist:
htpl = self._template_t2ht_(tpl)
self._thtemplates += htpl,
self._tbase = items[0].filled()[selspace]*0.
self._tbase.shape = -1
if self.thist:
self._thbase = N.resize(self._tbase.astype('l'),
(self.nbins, self._tbase.size))
# Save some attributes
self._atts = ()
for var in items:
attr = {}
for att in 'id', 'units', 'long_name', 'standard_name':
if hasattr(var, att):
attr[att] = getattr(var, att)
self._atts += attr,
# Init accumulated variables
# - temporal statistics
if self.tstats:
self._nts = []
if self.tsum:
self._tsum = self._tbase.copy(),
if self.dual: self._tsum += self._tbase.copy(),
# - square
if self.tsqr:
self._tsqr = self._tbase.copy(),
if self.dual: self._tsqr += self._tbase.copy(),
# - product
if self.tprod: self._tprod = self._tbase.copy()
# - count
self._tcount = self._tbase.copy().astype('l')
self.nt = 0
# - min/max
if self.tmin:
self._tmin = self._tbase.copy()+N.inf,
if self.dual: self._tmin += self._tbase.copy()+N.inf,
if self.tmax:
self._tmax = self._tbase.copy()-N.inf,
if self.dual: self._tmax += self._tbase.copy()-N.inf,
# - histograms
if self.thist:
self._thist = self._thbase.copy(),
if self.dual:
self._thist += self._thbase.copy(),
# - spatial statistics
if self.sstats:
self._sstats = {}
# for name in self.all_stats:
# if not getattr(self, 's'+name): continue
# self._sstats[name] = []
# setattr(self, '_s'+name, [])
self._stimes = tuple([[] for i in xrange(len(items))]) if self.withtime else None
self._scount = []
self.ns = items[0].size/items[0].shape[0]
# Checks
if not self.tstats and not self.sstats:
raise StatAccumError("Cant' accumulate statistics since they are all are OFF.")
if self.dual:
if len(items)!=2:
raise StatAccumError("You must always provide two variables for accumulating stats")
if items[0].shape!=items[1].shape:
raise StatAccumError("Compared variables must have the same shape: %s != %s"%(
items[0].shape, items[1].shape))
# As 2D (time-space) zero-filled arrays
nt = items[0].shape[0] if self.withtime else 1
self._nts.append(nt)
item0 = items[0].asma().reshape(nt, -1)
mask0 = N.ma.getmaskarray(item0)
if self.dual:
item1 = items[1].asma().reshape(nt, -1)
mask1 = N.ma.getmaskarray(item1)
mask = mask0 | mask1
else:
mask = mask0
item0[mask] = N.ma.masked
if self.tmax or self.tmin or self.smax or self.smin:
item0m = item0
item0 = item0.filled(0.)
if self.dual:
item1[mask] = N.ma.masked
if self.tmax or self.tmin or self.smax or self.smin:
item1m = item1
item1 = item1.filled(0.)
# Accumulate
# - temporal statistics
if self.tstats:
self._tcount += (~mask).astype('l').sum(axis=0)
self.nt += items[0].shape[0]
if self.tsum:
self._tsum[0][:] += item0.sum(axis=0)
if self.dual: self._tsum[1][:] += item1.sum(axis=0)
if self.tsqr:
self._tsqr[0][:] += (item0**2).sum(axis=0)
if self.dual: self._tsqr[1][:] += (item1**2).sum(axis=0)
if self.tprod:
self._tprod[:] += (item0*item1).sum(axis=0)
if self.tmin:
vmin = item0m.min(axis=0)
self._tmin[0][:] = N.ma.where(vmin<self._tmin[0], vmin, self._tmin[0])
if self.dual:
vmin = item1m.min(axis=0)
self._tmin[1][:] = N.ma.where(vmin<self._tmin[1], vmin, self._tmin[1])
del vmin
if self.tmax:
vmax = item0m.max(axis=0)
self._tmax[0][:] = N.ma.where(vmax>self._tmax[0], vmax, self._tmax[0])
if self.dual:
vmax = item1m.max(axis=0)
self._tmax[1][:] = N.ma.where(vmax>self._tmax[1], vmax, self._tmax[1])
del vmax
if self.thist:
indices0 = N.digitize(item0.flat, self.bins)
indices0[mask.ravel()] = 0
if self.dual:
indices1 = N.digitize(item1.flat, self.bins)
indices1[mask.ravel()] = 0
for ib in xrange(self.nbins):
valid0 = indices0.reshape(item0.shape)==ib+1
valid0 &= ~mask
self._thist[0][ib] += valid0.sum(axis=0)
if self.dual:
valid1 = indices1.reshape(item1.shape)==ib+1
valid1 &= ~mask
self._thist[1][ib] += valid1.sum(axis=0)
del indices0, valid0
if self.dual: del indices1, valid1
# - spatial statistics
if self.sstats:
# Count
scount = (~mask).astype('l').sum(axis=1)
# if self.savail:
self._scount.append(scount)
# Base
if self.ssum:
ssum = item0.sum(axis=1),
if self.dual: ssum += item1.sum(axis=1),
if self.ssqr:
ssqr = (item0**2).sum(axis=1),
if self.dual:
ssqr += (item1**2).sum(axis=1),
if self.sprod:
sprod = (item0*item1).sum(axis=1)
if self.smin:
smin = item0m.min(axis=1),
if self.dual:
smin += item1m.min(axis=1),
if self.smax:
smax = item0m.max(axis=1),
if self.dual:
smax += item1m.max(axis=1),
if self.shist:
indices0 = N.digitize(item0.flat, self.bins)
indices0[mask.ravel()] = 0
if self.dual:
indices1 = N.digitize(item1.flat, self.bins)
indices1[mask.ravel()] = 0
shist = N.zeros(item0.shape[:1]+(self.nbins, ), 'l'),
if self.dual: shist += shist[0].copy(),
for ib in xrange(self.nbins):
valid0 = indices0.reshape(item0.shape)==ib+1
shist[0][:, ib] += valid0.sum(axis=1)
if self.dual:
valid1 = indices1.reshape(item1.shape)==ib+1
shist[1][:, ib] += valid1.sum(axis=1)
# Stats
sstats = self._base2stats_(self.ns, scount, ssum if self.ssum else None,
ssqr if self.ssqr else None, sprod if self.sprod else None,
smin if self.smin else None, smax if self.smax else None,
shist if self.shist else None,
self.smean, self.sstd, self.sbias, self.srms, self.scrms,
self.scorr, self.savail, self.scount,
self.smin, self.smax, self.shist)
for key, val in sstats.iteritems():
# self._sstats[key].append(val)
# getattr(self, '_s'+key).append(val)
if isinstance(val, tuple):
self._sstats.setdefault(key, ([], []))
for i, v in enumerate(val):
self._sstats[key][i].append(v)
else:
self._sstats.setdefault(key, [])
self._sstats[key].append(val)
# Time axes (for concatenation)
if self.withtime:
for i, item in enumerate(items):
self._stimes[i].append(item.getAxis(0))
# Save iterative status
self.iterindex += 1
if self.withtime:
self.lasttime = items[0].getTime().asComponentTime()[-1]
def __iadd__(self, items):
if isinstance(items, tuple):
self.append(*items)
else:
self.append(items)
return self
[docs] def isempty(self):
return self.iterindex==0
def _template_t2ht_(self, tpl):
htpl = MV2.resize(tpl.astype('l'), (self.nbins,)+tpl.shape)
htpl.setAxisList([self._baxis]+tpl.getAxisList())
set_grid(htpl, get_grid(tpl))
cp_atts(tpl, htpl)
return htpl
def _asarray_(self, a):
"""Merge a list of arrays along time"""
if isinstance(a, list):
return N.concatenate(a, axis=0)
return a
def _aslist_(self, a):
"""Split an array along time"""
return N.split(a, N.cumsum(self._nts[:-1]), axis=0)
def _dump_array_(self, f, var, id, axis):
"""Dump an array or a list of them in a nctdf file"""
istime = isinstance(var, list)
ishist = 'hist' in id
var = self._asarray_(var)
var = MV2.array(var, copy=0, id=id)
var.setAxis(int(not istime and ishist), axis)
if ishist:
var.setAxis(istime, self._baxis)
if var.dtype.char in 'fd':
var.set_fill_value(1e20)
else:
var.set_fill_value(-1)
return f.write(var, extend=0)
def _load_array_(self, f, id):
var = f(id)
istime = 't' in var.getOrder()
var = var.filled()
if istime:
var = self._aslist_(var)
return var
[docs] def dump(self, restart_file=None):
"""Dump the current instance to a netcdf file"""
# File
netcdf4()
if restart_file is None:
restart_file = self.restart_file
if restart_file is None:
restart_file = self.default_restart_file
self.restart_file = restart_file
if os.path.exists(restart_file):
os.remove(restart_file)
f = cdms2.open(restart_file, 'w')
# Config
# - what was initially asked and some more
for sname in self.all_stats + ('sum', 'sqr', 'prod', 'stats'):
for st in 'st':
value = getattr(self, st+sname)
if value is None: continue
if isinstance(value, bool): value = int(value)
setattr(f, st+sname, value)
# - current status
f.iterindex = self.iterindex
f.nitems = int(self.nitems)
f.withtime = -1 if self.withtime is None else int(self.withtime)
if self.withtime and self.lasttime:
f.lasttime = str(self.lasttime)
if self.bins is None:
f.bin_edges = 0
else:
f.bin_edges = self.bins
if f.nitems==0: # Still no data
f.close()
return
# - already had some data
f.dual = int(self.dual)
f.ns = self.ns
f.nt = self.nt
f.nts = self._nts
f.tstats = int(self.tstats)
f.sstats = int(self.sstats)
# Spatial statistics
if self.sstats:
# Time axes
if self.withtime:
taxes = ()
for i, tt in enumerate(self._stimes):
taxis = MV2_axisConcatenate(tt)
taxis.stataccum_oldid = taxis.id
taxis.id = 't'+str(i)
taxes += taxis,
else:
taxes = [cdms2.createAxis(N.arange(self.nt), id='t')]
# Count
self._dump_array_(f, var=self._scount, id='scount', axis=taxes[0])
# Other stats
for key, stats in self._sstats.items():
if isinstance(stats, tuple):
for i, stat in enumerate(stats):
self._dump_array_(f, var=stat, id='s%s%s'%(key, str(i)),
axis=taxes[i])
else:
self._dump_array_(f, var=stats, id='s'+key, axis=taxes[0])
# Temporal statistics
if self.tstats:
# Main axis
axis = cdms2.createAxis(N.arange(self.ns), id='s')
# Count
self._dump_array_(f, var=self._tcount, id='tcount', axis=axis)
# Other stats
for key in self._dual_accums+self._single_accums:
if not getattr(self, 't'+key): continue
value = getattr(self, '_t'+key)
if key in self._dual_accums:
self._dump_array_(f, var=value, id='t'+key, axis=axis)
else:
for i, stat in enumerate(value):
self._dump_array_(f, var=stat, id='t%s%s'%(key, str(i)), axis=axis)
# Templates
ttemplates = [self._ttemplates]
# if self.thist:
# ttemplates.append(self._thtemplates)
for ttpls in ttemplates:
for i, ttpl in enumerate(ttpls):
# ttpl = ttpl.clone(copyData=0)
# for ia, axis in enumerate(ttpl.getAxisList()): #FIXME:grid cloning
# ttpl.setAxis(ia, axis.clone(copyData=0))
_add_id_prefix_(ttpl, 'var%i_'%i, exc=self._baxis)
f.write(ttpl)
_rm_id_prefix_(ttpl, 'var%i_'%i, exc=self._baxis)
# Attributes
for ivar, atts in enumerate(self._atts):
var = MV2.zeros(1)
set_atts(var, atts)
var.id = 'var%i_atts'%ivar
var.stataccum_id = atts['id']
var.getAxis(0).id = 'var_att_axis'
f.write(var)
# self._ttemplates, self._thtemplates, self._atts, self._tbase
f.close()
return f.id
[docs] def load(self, restart_file=None, iterindex=None, nowtime=None):
"""Load the current instance from a netcdf file
:Params:
- **restart_file**, optional: Netcdf restart file.
- **iterindex**, optional: If given, the restart file is not loaded if
``iterindex`` is greater or equal to the file's ``iterindex`` attribute.
- **nowtime**, optional: If given, the restart file is not loaded if
``nowtime`` is greater or equal to the file's ``lasttime`` attribute.
"""
# File
if restart_file is None:
restart_file = self.restart_file
if restart_file is None:
restart_file = self.default_restart_file
self.restart_file = restart_file
f = cdms2.open(restart_file)
# Config
# - check status
if iterindex is not None:
self.iterindex = iterindex
if hasattr(self, 'iterindex') and f.iterindex<=self.iterindex:
return -1
if nowtime is not None:
self.lasttime = comptime(nowtime)
if (hasattr(self, 'lasttime') and f.withtime>0 and self.lasttime
and reltime(f.lasttime, 'hours since 2000').value <=
reltime(self.lasttime, 'hours since 2000').value):
return -1
# - what was initially asked and some more
for sname in self.all_stats + ('sum', 'sqr', 'prod', 'stats'):
for st in 'st':
if not hasattr(f, st+sname): continue
value = getattr(f, st+sname)
setattr(self, st+sname, bool(value))
# - current status
self.iterindex = int(f.iterindex)
self.nitems = int(f.nitems)
if f.withtime==-1:
self.withtime = None
else:
self.withtime = bool(f.withtime)
if f.withtime:
self.lasttime = cdtime.s2c(f.lasttime)
if N.isscalar(f.bin_edges):
self.bins = None
else:
self.bins = N.asarray(f.bin_edges)
self.nbins = self.bins.shape[0]-1
self._baxis = f.getAxis('hbin').clone()
if self.nitems==0: # Still no data
f.close()
return 0
# - already had some data
self.dual = bool(f.dual)
self.ns = int(f.ns)
self.nt = int(f.nt)
self._nts = f.nts.tolist()
self.tstats = bool(f.tstats)
self.sstats = bool(f.sstats)
if not self.withtime:
self._stimes = None
# Spatial statistics
if self.sstats:
# Time axes
if self.withtime:
self._stimes = tuple([[] for i in xrange(self.nitems)])
for i, tt in enumerate(self._stimes):
taxis = f.getAxis('t'+str(i))
tvalues = self._aslist_(taxis[:])
oldid = taxis.stataccum_oldid
for tvals in tvalues:
tx = create_time(tvals, taxis.units, id=oldid)
cp_atts(taxis, tx, id=False, exclude=[oldid])
self._stimes[i].append(tx)
# Count
self._scount = self._load_array_(f, id='scount')
# Other stats
self._sstats = {}
for key in self.single_stats:
if not self.dual: # single var
vid = 's' + key
if vid not in f.variables:
continue
self._sstats[key] = self._load_array_(f, vid),
else: # two vars
for i in xrange(self.nitems):
vid = 's%s%s'%(key, str(i))
if vid not in f.variables:
break
self._sstats.setdefault(key, ())
self._sstats[key] += self._load_array_(f, vid),
for key in self.dual_stats:
vid = 's%s'%key
if vid in f.variables:
self._sstats[key] = self._load_array_(f, vid)
# Temporal statistics
if self.tstats:
# Count
self._tcount = self._load_array_(f, 'tcount')
# Other stats
for key in self._dual_accums+self._single_accums:
tid = 't'+key
if not getattr(self, tid): continue
if key in self._dual_accums:
value = self._load_array_(f, tid)
setattr(self, '_'+tid, value)
else:
value = ()
for i in xrange(self.nitems):
value += self._load_array_(f, tid+str(i)),
setattr(self, '_'+tid, value)
# Templates
# - base arrays
self._tbase = N.zeros(self.ns)
if self.thist:
self._thbase = N.zeros((self.nbins, self.ns), 'l')
# - cdat templates
self._ttemplates = ()
self._thtemplates = None
if self.thist:
self._thtemplates = ()
for i in xrange(self.nitems):
prefix = 'var%i_'%i
for vname in f.variables:
if vname.startswith(prefix) and vname != prefix+'atts': break
ttpl = f(vname)
_rm_id_prefix_(ttpl, 'var%i_'%i, exc=self._baxis)
self._ttemplates += ttpl,
if self.thist:
self._thtemplates += self._template_t2ht_(ttpl),
# Attributes
self._atts = ()
for ivar in xrange(self.nitems):
attrs = f['var%i_atts'%ivar].attributes.copy()
attrs['id'] = attrs['stataccum_id']
del attrs['stataccum_id']
self._atts += attrs,
f.close()
return self.iterindex
@staticmethod
def _base2stats_(size, count, sum, sqr, prod, vmin, vmax, hist,
domean, dostd, dobias, dorms, docrms, docorr, doavail,
docount, domin, domax,
dohist):
# Denominators
if isinstance(count, N.ndarray):
bad = count==0
count_ = N.where(bad, 1, count)
countd = N.where(bad, 0., 1./count_)
# bad = count<2
# count = N.where(bad, 2, count)
# countdm1 = N.where(bad, 0., 1./(count-1))
# del count
else:
countd = 0. if count == 0 else 1./count
# countdm1 = 0. if scount < 2 else 1./(count-1)
# Check dual inputs
for vname in ('count',' sum',' sqr',' prod',' vmin',' vmax',' hist'):
val = eval(vname)
if val is None: continue
dual = len(val)==2
if dual: break
if dual:
if sum is not None: sum0, sum1 = sum
if sqr is not None: sqr0, sqr1 = sqr
else:
if sum is not None: sum0 = sum[0]
if sqr is not None: sqr0 = sqr[0]
# Init
results = {}
# Count
if docount:
results['count'] = count
# Availability
if doavail:
results['avail'] = 1.* count / size
# Mean
if domean:
results['mean'] = sum0 * countd
if dual:
results['mean'] = results['mean'], sum1 * countd
# Standard deviation & sum(Xi'^2) & sum(Yi'^2)
if dostd or docorr:
sqrp0 = - sum0**2 * countd # sum(Xi'^2)
sqrp0 += sqr0
if dostd:
try:
results['std'] = N.sqrt(sqrp0 * countd)#m1)
except:
pass
if not docorr: del sqrp0
if dual:
sqrp1 = - sum1**2 * countd # sum(Yi'^2)
sqrp1 += sqr1
if dostd:
results['std'] = results['std'], N.sqrt(sqrp1 * countd)
if not docorr: del sqrp1
# Bias
if dobias:
results['bias'] = (sum1-sum0) * countd
# Absolute RMS
if dorms:
rms = sqr0 + sqr1
rms -= prod * 2
results['rms'] = N.sqrt(rms * countd)#m1)
del sqr0, sqr1, rms
# sum(Xi'Yi')
if docrms or docorr:
prodp = prod - sum0 * sum1 * countd
# Centered RMS
if docrms:
crms = sqrp0 + sqrp1
crms -= prodp * 2
if not docorr: del prodp
results['crms'] = N.sqrt(crms * countd)#m1)
del crms
# Correlation
if docorr:
results['corr'] = prodp / N.sqrt(sqrp0*sqrp1)
del prodp, sqrp0, sqrp1
# Min/max
if domin:
results['min'] = vmin if dual else vmin[0]
if domax:
results['max'] = vmax if dual else vmax[0]
# Histograms
if dohist:
results['hist'] = hist if dual else hist[0]
return results
def _checkstat_(self, name):
if self.iterindex==0:
raise StatAccumError("Can't get statistics '%s' since still nothing has been accumulated"%name)
if not getattr(self, name):
raise StatAccumError("Can't get statistics '%s' since it set to OFF")
def _get_masks_(self, stats, count):
masks = {}
count = N.asarray(count)
mask0 = count<1
# mask1 = count<2
del count
for key in stats:
if key in ['mean', 'bias']:
masks[key] = mask0
elif key in ['avail', 'count']:
masks[key] = None
else:
masks[key] = mask0
return masks
@staticmethod
def _format_var_(vv, name, mask=None, prefix=True, suffix=True, templates=None,
htemplates=None, attributes=None, single=True, **kwargs):
# Some inits
ist = name.startswith('t')
name = name[1:]
if prefix is True:
prefix = 'Temporal ' if ist else 'Spatial '
elif not prefix:
prefix = ''
kwargs['copy'] = 0
# Aways have a list
if isinstance(vv, tuple):
vv = list(vv)
single = False
elif not isinstance(vv, list):
vv = [vv]
single = True
elif single:
single = len(vv)==1
else:
single = False
dual = len(vv)==2
if templates is not None and not isinstance(templates, (list, tuple)):
templates = [templates]*len(vv)
if htemplates is not None and not isinstance(htemplates, (list, tuple)):
htemplates = [htemplates]*len(vv)
if attributes is None:
attributes = ({}, {})
elif not isinstance(attributes, (list, tuple)):
attributes = [attributes]*len(vv)
# Long_name
long_name = prefix
if name=='avail':
long_name += 'availability'
if name=='count':
long_name += 'count'
elif name=='mean':
long_name += 'average'
elif name=='std':
long_name += 'standard deviation'
elif name=='bias':
long_name += 'bias'
elif name=='rms':
long_name += 'absolute RMS difference'
elif name=='crms':
long_name += 'centered RMS difference'
elif name=='corr':
long_name += 'correlation'
elif name=='hist':
long_name += 'histogram'
# Type change
if 'count' in name:
dtype = 'l'
else:
dtype = None
for i, v in enumerate(vv):
# From template or not
hvar = v.ndim==2 # Histogram-like variables: first axis = bins
hist = htemplates and hvar
if templates is not None:
# From axis (sstats)
if isaxis(templates[i]):
# if hvar and not ist:
# v = N.ma.transpose(v) # time first, not bins
var = MV2.asarray(v)
var.setAxis(0, templates[i])
# try:
# var.setAxis(int(hist), templates[i])
# except:
# pass
if hist:
var.setAxis(1, htemplates[i])
# From variable (tstats)
else:
try:
var = (htemplates if hvar else templates)[i].clone()
except:
pass
try:
var[:] = v.reshape(var.shape)
except:
pass
else:
var = MV2.asarray(v)
del v
# Type
if dtype:
var = var.astype(dtype)
# Mask
if mask is not None:
func = N.resize if var.size!=mask.size else N.reshape
var[:] = MV2.masked_where(func(mask, var.shape), var, copy=0)
# Attributes
# - id and long_name
var.id = attributes[i]['id']+'_'+name
var.long_name = long_name
if suffix and isinstance(suffix, basestring):
var.long_name += suffix
# - single stats
if name in ['mean', 'std', 'hist', 'min', 'max']:
if "units" in attributes[i]:
var.units = attributes[i]['units']
if suffix is True and "long_name" in attributes[i]:
var.long_name += ' of '+attributes[i]['long_name']
# - dual stats
elif name in ['bias', 'rms', 'crms']:
for attr in attributes: # loop on both variables attributes
if "units" in attr and not hasattr(var, 'units'):
var.units = attr['units']
if suffix is True and "long_name" in attr and var.long_name==long_name:
var.long_name += ' of '+attr['long_name']
vv[i] = var
if single:
return vv[0]
return tuple(vv)
[docs] def get_tstats(self, **kwargs):
"""Get all temporal statistics as a dictionary"""
if not self.tstats: return {}
# From cache
if getattr(self, '_cache_titerindex', -1)==self.iterindex:
return self._cache_tstats
# Compute stats
tstats = self._base2stats_(self.nt, self._tcount, self._tsum if self.tsum else None,
self._tsqr if self.tsqr else None, self._tprod if self.tprod else None,
self._tmin if self.tmin else None, self._tmax if self.tmax else None,
self._thist if self.thist else None,
self.tmean, self.tstd, self.tbias,
self.trms, self.tcrms, self.tcorr, self.tavail, self.tcount,
self.tmin, self.tmax, self.thist)
# Masks
masks = self._get_masks_(tstats, self._tcount)
# Format and cache
kwargs.update(templates=self._ttemplates, htemplates=self._thtemplates,
attributes=self._atts, single=True, baxis=self._baxis)
self._cache_tstats = dict([(name,
self._format_var_(var, 't'+name, mask=masks[name], **kwargs))
for name, var in tstats.iteritems()])
self._cache_titerindex = self.iterindex
del masks
return self._cache_tstats
[docs] def get_tavail(self, **kwargs):
"""Get the temporal availability"""
self._checkstat_('tavail')
return self.get_tstats()['avail']
[docs] def get_tcount(self, **kwargs):
"""Get the temporal count"""
self._checkstat_('tcount')
return self.get_tstats()['count']
[docs] def get_tmean(self, **kwargs):
"""Get the temporal standard deviation"""
self._checkstat_('tmean')
return self.get_tstats()['mean']
[docs] def get_tstd(self, **kwargs):
"""Get the temporal mean"""
self._checkstat_('tstd')
return self.get_tstats()['std']
[docs] def get_tbias(self, **kwargs):
"""Get the temporal bias"""
self._checkstat_('tbias')
return self.get_tstats()['bias']
[docs] def get_trms(self, **kwargs):
"""Get the temporal absolute RMS difference"""
self._checkstat_('trms')
return self.get_tstats()['rms']
[docs] def get_tcrms(self, **kwargs):
"""Get the temporal centered RMS difference"""
self._checkstat_('tcrms')
return self.get_tstats()['crms']
[docs] def get_tcorr(self, **kwargs):
"""Get the temporal correlation"""
self._checkstat_('tcorr')
return self.get_tstats()['corr']
[docs] def get_thist(self, **kwargs):
"""Get the temporal histogram"""
self._checkstat_('thist')
return self.get_tstats()['hist']
[docs] def get_sstats(self, **kwargs):
"""Get all spatial statistics as a dictionary"""
if not self.sstats: return {}
# From cache
if getattr(self, '_cache_siterindex', None)==self.iterindex:
return self._cache_sstats
# Aggregate pure numeric stats
sstats = {}
for stype, vals in self._sstats.items():
if isinstance(vals, tuple):
sstats[stype] = self._asarray_(vals[0]), self._asarray_(vals[1])
else:
sstats[stype] = self._asarray_(vals)
# # Availability
# if self.savail:
# if self.smask is not None:
# if isinstance(self.smask , N.ndarray):
# maxcount = (~self.smask).astype('l').sum()
# else:
# maxcount = self.smask
# else:
# maxcount = self._scount.max()
# sstats['avail'] = self._scount*1./maxcount
# Masks
masks = self._get_masks_(sstats, self._asarray_(self._scount))
# Time axes
stimes = None if self._stimes is None else \
[MV2_axisConcatenate(stime) for stime in self._stimes]
# Format and cache
kwargs.update(templates=stimes, htemplates=self._baxis, attributes=self._atts)
self._cache_sstats = dict([(name,
self._format_var_(var, 's'+name, masks[name], **kwargs))
for name, var in sstats.iteritems()])
self._cache_siterindex = self.iterindex
del masks
return self._cache_sstats
[docs] def get_savail(self, **kwargs):
"""Get the spatial availability"""
self._checkstat_('savail')
return self.get_sstats()['avail']
[docs] def get_scount(self, **kwargs):
"""Get the spatial count"""
self._checkstat_('scount')
return self.get_sstats()['count']
[docs] def get_smean(self, **kwargs):
"""Get the spatial standard deviation"""
self._checkstat_('smean')
return self.get_sstats()['mean']
[docs] def get_sstd(self, **kwargs):
"""Get the spatial mean"""
self._checkstat_('sstd')
return self.get_sstats()['std']
[docs] def get_sbias(self, **kwargs):
"""Get the spatial bias"""
self._checkstat_('sbias')
return self.get_sstats()['bias']
[docs] def get_srms(self, **kwargs):
"""Get the spatial absolute RMS difference"""
self._checkstat_('srms')
return self.get_sstats()['rms']
[docs] def get_scrms(self, **kwargs):
"""Get the spatial centered RMS difference"""
self._checkstat_('scrms')
return self.get_sstats()['crms']
[docs] def get_scorr(self, **kwargs):
"""Get the spatial correlation"""
self._checkstat_('scorr')
return self.get_sstats()['corr']
[docs] def get_shist(self, **kwargs):
"""Get the spatial histogram"""
self._checkstat_('shist')
return self.get_sstats()['hist']
[docs] def get_stats(self):
"""Get all statistics as dict"""
stats = {}
for key, val in self.get_tstats().items():
stats['t'+key] = val
for key, val in self.get_sstats().items():
stats['s'+key] = val
return stats
def __getitem__(self, key):
return self.get_stats()[key]
def _get_var_and_axes_(var, exc=None):
"""Get a list of a variable and its axes"""
if exc is None: exc = []
if not isinstance(exc, (list, tuple)):
exc = [exc]
out = [var]+var.getAxisList()
if var.getGrid() and var.getLongitude().id!=var.getAxis(-1).id:
out.extend([var.getLongitude(), var.getLatitude()])
if not exc: return out
exc0 = map(id, exc)
exc1 = [e.id for e in exc]
return filter(lambda o: id(o) not in exc0 and o.id not in exc1, out)
def _rm_id_prefix_(var, prefix, exc=None):
"""Remove prefix in ids of a variable and its axes"""
for obj in _get_var_and_axes_(var, exc=exc):
if obj.id.startswith(prefix):
obj.id = obj.id[len(prefix):]
def _add_id_prefix_(var, prefix, exc=None):
"""Add prefix in ids of a variable and its axes when not present"""
for obj in _get_var_and_axes_(var, exc=exc):
if not obj.id.startswith(prefix):
obj.id = prefix+obj.id