Skip to content

Commit

Permalink
Merge pull request #43 from cedadev/cf17_table_caching
Browse files Browse the repository at this point in the history
Improve execution speed
  • Loading branch information
RosalynHatcher authored Mar 21, 2018
2 parents c0c16ea + cda1499 commit 445c4f7
Show file tree
Hide file tree
Showing 36 changed files with 204 additions and 79 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Alternatively, to install from source:

## Running the CF Checker

`cfchecks [-a|--area_types area_types.xml] [-s|--cf_standard_names standard_names.xml] [-v|--version CFVersion] file1 [file2...]`
`cfchecks [-a area_types.xml] [-s standard_names.xml] [-t cache_time_days] [-v CFVersion] [-x] [--cache_dir <dir>] file1 [file2...]`

### Environment Variables

Expand Down
183 changes: 152 additions & 31 deletions src/cfchecker/cfchecks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
# CF Checker Version: See __version__
#
#-------------------------------------------------------------
''' cfchecker [-a|--area_types area_types.xml] [-s|--cf_standard_names standard_names.xml] [-v|--version CFVersion] file1 [file2...]
''' cfchecks [OPTIONS] file1 [file2...]
Description:
The cfchecker checks NetCDF files for compliance to the CF standard.
The CF Checker checks NetCDF files for compliance to the CF standard.
Options:
-a or --area_types:
Expand All @@ -30,11 +30,21 @@
-s or --cf_standard_names:
the location of the CF standard name table (xml)
-v or --version: CF version to check against, use auto to auto-detect the file version.
-t or --cache_time_days <days>:
set the cache retention period in days [default 1 day].
-v or --version:
CF version to check against, use auto to auto-detect the file version.
-x or --cache_tables:
cache the standard name, area type and region name tables.
--cache_dir <dir>:
directory in which to store cached tables [default /tmp].
'''

import sys
import sys, os, time

if sys.version_info[:2] < (2,7):
from ordereddict import OrderedDict
Expand Down Expand Up @@ -125,21 +135,54 @@ def __cmp__(self, other):


class ConstructDict(ContentHandler):
"""Parse the xml standard_name table, reading all entries
into a dictionary; storing standard_name and units.
"""Parse the xml standard_name table, reading all entries into a dictionary;
storing standard_name and units.
If useShelve is True, a python shelve file will be used. If the file is
present and less than cacheTime seconds old, the existing contents will be
used, otherwise the standard name table will be parsed and written to the
shelf file.
"""
def __init__(self):
def __init__(self, useShelve=False, shelveFile=None, cacheTime=0, cacheDir='/tmp'):
    """Set up the handler state and, optionally, the on-disk cache.

    useShelve  -- when True, self.dict is a shelve file instead of a
                  plain dictionary.
    shelveFile -- file name of the shelf inside cacheDir; defaults to
                  'cfexpr_cache'.
    cacheTime  -- maximum age, in seconds, for cached contents to be
                  reused.
    cacheDir   -- directory holding the shelf.

    After construction self.current is True only when a complete cache
    younger than cacheTime was found; in that case version_number and
    last_modified are restored from it and no parsing is required.
    """
    self.inUnitsContent = 0
    self.inEntryIdContent = 0
    self.inVersionNoContent = 0
    self.inLastModifiedContent = 0
    self.current = False
    self.useShelve = useShelve

    if not useShelve:
        self.dict = {}
        return

    import shelve
    if shelveFile is None:
        shelveFile = 'cfexpr_cache'
    self.shFile = os.path.join(cacheDir, shelveFile)
    now = time.time()
    self.dict = shelve.open(self.shFile)

    # Decide validity from the shelf contents rather than from file names:
    # the files created on disk depend on the dbm backend in use.
    # '__info__' is only written by close(), so its absence marks a cache
    # that was never completed and must not be trusted.
    try:
        ctime = self.dict['__contentTime__']
        info = self.dict['__info__']
    except KeyError:
        ctime = info = None

    if info is not None and (now - ctime) < cacheTime:
        self.current = True
        self.version_number, self.last_modified = info
    else:
        # Drop stale entries so names removed from the table do not
        # survive a refresh, then stamp the new contents.
        self.dict.clear()
        self.dict['__contentTime__'] = now

def close(self):
    """Record the table metadata in the cache and close the shelf.

    Does nothing unless the handler was created with useShelve=True.
    '__info__' is written only when both version_number and last_modified
    have been set on the handler. The shelf is closed either way, so
    calling this after a failed parse neither raises AttributeError nor
    leaves the shelf open.
    """
    if not self.useShelve:
        return
    try:
        if hasattr(self, 'version_number') and hasattr(self, 'last_modified'):
            self.dict['__info__'] = (self.version_number, self.last_modified)
    finally:
        self.dict.close()

def startElement(self, name, attrs):
# If it's an entry element, save the id
if name == 'entry':
id = normalize_whitespace(attrs.get('id', ""))
self.this_id = id
self.this_id = str(id)

# If it's the start of a canonical_units element
elif name == 'canonical_units':
Expand All @@ -148,7 +191,7 @@ def startElement(self, name, attrs):

elif name == 'alias':
id = normalize_whitespace(attrs.get('id', ""))
self.this_id = id
self.this_id = str(id)

elif name == 'entry_id':
self.inEntryIdContent = 1
Expand Down Expand Up @@ -186,7 +229,7 @@ def endElement(self, name):
# If it's the end of the entry_id element, find the units for the self.alias
elif name == 'entry_id':
self.inEntryIdContent = 0
self.entry_id = normalize_whitespace(self.entry_id)
self.entry_id = str(normalize_whitespace(self.entry_id))
try:
self.dict[self.this_id] = self.dict[self.entry_id]
except KeyError:
Expand All @@ -209,16 +252,48 @@ class ConstructList(ContentHandler):
"""Parse the xml area_type table, reading all area_types
into a list.
"""
def __init__(self):
def __init__(self, useShelve=False, shelveFile=None, cacheTime=0, cacheDir='/tmp'):
    """Set up the handler state and, optionally, the on-disk cache.

    useShelve  -- when True, self.list is a shelve file (entries stored
                  as id -> id) instead of an in-memory set.
    shelveFile -- file name of the shelf inside cacheDir; defaults to
                  'cfexpr_cachel'.
    cacheTime  -- maximum age, in seconds, for cached contents to be
                  reused.
    cacheDir   -- directory holding the shelf.

    After construction self.current is True only when a complete cache
    younger than cacheTime was found; in that case version_number and
    last_modified are restored from it and no parsing is required.
    """
    self.inVersionNoContent = 0
    self.inLastModifiedContent = 0
    self.current = False
    self.useShelve = useShelve

    if not useShelve:
        self.list = set()
        return

    import shelve
    if shelveFile is None:
        shelveFile = 'cfexpr_cachel'
    self.shFile = os.path.join(cacheDir, shelveFile)
    now = time.time()
    self.list = shelve.open(self.shFile)

    # Decide validity from the shelf contents rather than from file names:
    # the files created on disk depend on the dbm backend in use.
    # '__info__' is only written by close(), so its absence marks a cache
    # that was never completed and must not be trusted.
    try:
        ctime = self.list['__contentTime__']
        info = self.list['__info__']
    except KeyError:
        ctime = info = None

    if info is not None and (now - ctime) < cacheTime:
        self.current = True
        self.version_number, self.last_modified = info
    else:
        # Drop stale entries so names removed from the table do not
        # survive a refresh, then stamp the new contents.
        self.list.clear()
        self.list['__contentTime__'] = now

def close(self):
    """Record the table metadata in the cache and close the shelf.

    Does nothing unless the handler was created with useShelve=True.
    '__info__' is written only when both version_number and last_modified
    have been set on the handler. The shelf is closed either way, so
    calling this after a failed parse neither raises AttributeError nor
    leaves the shelf open.
    """
    if not self.useShelve:
        return
    try:
        if hasattr(self, 'version_number') and hasattr(self, 'last_modified'):
            self.list['__info__'] = (self.version_number, self.last_modified)
    finally:
        self.list.close()

def startElement(self, name, attrs):
# If it's an entry element, save the id
if name == 'entry':
id = normalize_whitespace(attrs.get('id', ""))
self.list.append(id)
id = str( normalize_whitespace(attrs.get('id', "")) )
if self.useShelve:
self.list[id] = id
else:
self.list.add(id)

elif name == 'version_number':
self.inVersionNoContent = 1
Expand Down Expand Up @@ -294,14 +369,20 @@ class FatalCheckerError(Exception):
#======================
class CFChecker:

def __init__(self, uploader=None, useFileName="yes", badc=None, coards=None, cfStandardNamesXML=STANDARDNAME, cfAreaTypesXML=AREATYPES, cfRegionNamesXML=REGIONNAMES, version=newest_version, debug=False, silent=False):
def __init__(self, uploader=None, useFileName="yes", badc=None, coards=None,
cfStandardNamesXML=STANDARDNAME, cfAreaTypesXML=AREATYPES,
cfRegionNamesXML=REGIONNAMES, cacheTables=False, cacheTime=0,
cacheDir='/tmp', version=newest_version, debug=False, silent=False):
self.uploader = uploader
self.useFileName = useFileName
self.badc = badc
self.coards = coards
self.standardNames = cfStandardNamesXML
self.areaTypes = cfAreaTypesXML
self.regionNames = cfRegionNamesXML
self.cacheTables = cacheTables
self.cacheTime = cacheTime
self.cacheDir = cacheDir
self.version = version
self.all_results = OrderedDict() # dictonary of results sorted by file and then by globals / variable
# and then by category
Expand Down Expand Up @@ -357,20 +438,26 @@ def checker(self, file):
# Set up dictionary of standard_names and their assoc. units
parser = make_parser()
parser.setFeature(feature_namespaces, 0)
self.std_name_dh = ConstructDict()
parser.setContentHandler(self.std_name_dh)
parser.parse(self.standardNames)
self.std_name_dh = ConstructDict(useShelve=self.cacheTables, cacheTime=self.cacheTime,
cacheDir=self.cacheDir)
if not self.std_name_dh.current:
parser.setContentHandler(self.std_name_dh)
parser.parse(self.standardNames)

if self.version >= vn1_4:
# Set up list of valid area_types
self.area_type_lh = ConstructList()
parser.setContentHandler(self.area_type_lh)
parser.parse(self.areaTypes)
self.area_type_lh = ConstructList(useShelve=self.cacheTables, shelveFile='cfarea_cache',
cacheTime=self.cacheTime, cacheDir=self.cacheDir)
if not self.area_type_lh.current:
parser.setContentHandler(self.area_type_lh)
parser.parse(self.areaTypes)

# Set up list of valid region_names
self.region_name_lh = ConstructList()
parser.setContentHandler(self.region_name_lh)
parser.parse(self.regionNames)
self.region_name_lh = ConstructList(useShelve=self.cacheTables, shelveFile='cfregion_cache',
cacheTime=self.cacheTime, cacheDir=self.cacheDir)
if not self.region_name_lh.current:
parser.setContentHandler(self.region_name_lh)
parser.parse(self.regionNames)

self._add_version("Using CF Checker Version %s" % __version__)
if not self.version:
Expand All @@ -395,6 +482,10 @@ def checker(self, file):
return self._checker()
finally:
self.f.close()
self.std_name_dh.close()
self.region_name_lh.close()
if self.version >= vn1_4:
self.area_type_lh.close()

def _init_results(self, filename):
"""
Expand Down Expand Up @@ -2048,7 +2139,8 @@ def chkFormulaTerms(self,varName,allCoordVars):
return

(stdName,modifier) = self.getStdName(var)

stdName=stdName.encode('ascii')

if not self.alias.has_key(stdName):
self._add_error("No formula defined for standard name: %s" % stdName, varName, code=scode)
# No formula available so can't validate formula_terms
Expand Down Expand Up @@ -2178,6 +2270,7 @@ def chkUnits(self,varName,allCoordVars):
# be consistent with units given in standard_name table
if hasattr(var, 'standard_name'):
(stdName,modifier) = self.getStdName(var)
stdName = stdName.encode('ascii')

# Is the Standard Name modifier number_of_observations being used.
if modifier == 'number_of_observations':
Expand Down Expand Up @@ -2867,6 +2960,12 @@ def getargs(arglist):
coards=None
version=newest_version
debug = False
# cacheTables : introduced to enable caching of CF standard name, area type and region name tables.
cacheTables = False
# default cache longevity is 1 day
cacheTime = 24*3600
# default directory to store cached tables
cacheDir = '/tmp'

# set to environment variables
if environ.has_key(standardnamekey):
Expand All @@ -2877,7 +2976,10 @@ def getargs(arglist):
regionnames=environ[regionnameskey]

try:
(opts,args)=getopt(arglist[1:],'a:bcdhlnr:s:v:',['area_types=','badc','coards','help','uploader','noname','region_names=','cf_standard_names=','version=', 'debug'])
(opts,args)=getopt(arglist[1:],'a:bcdhlnr:s:t:v:x',
['area_types=','badc','coards','debug','help','uploader',
'noname','region_names=','cf_standard_names=',
'cache_time_days=','version=','cache_tables','cache_dir='])
except GetoptError:
stderr.write('%s\n'%__doc__)
exit(1)
Expand All @@ -2892,6 +2994,8 @@ def getargs(arglist):
if a in ('-c','--coards'):
coards="yes"
continue
if a in ('--cache_dir'):
cacheDir=v.strip()
if a in ('-d','--debug'):
debug=True
continue
Expand All @@ -2910,6 +3014,9 @@ def getargs(arglist):
if a in ('-s','--cf_standard_names'):
standardname=v.strip()
continue
if a in ('-t','--cache_time_days'):
cacheTime=float(v)*24*3600
continue
if a in ('-v','--version'):
if v == 'auto':
version = CFVersion()
Expand All @@ -2924,19 +3031,33 @@ def getargs(arglist):
print "Performing check against newest version", newest_version
version = newest_version
continue
if a in ('-x','--cache_tables'):
cacheTables = True
continue

if len(args) == 0:
stderr.write('ERROR in command line\n\nusage:\n%s\n'%__doc__)
exit(1)

return (badc,coards,uploader,useFileName,standardname,areatypes,version,args,debug)
return (badc,coards,debug,uploader,useFileName,regionnames,standardname,areatypes,cacheDir,cacheTables,cacheTime,version,args)


def main():

(badc,coards,uploader,useFileName,standardName,areaTypes,version,files,debug)=getargs(sys.argv)
(badc,coards,debug,uploader,useFileName,regionnames,standardName,areaTypes,cacheDir,cacheTables,cacheTime,version,files)=getargs(sys.argv)

inst = CFChecker(uploader=uploader, useFileName=useFileName, badc=badc, coards=coards, cfStandardNamesXML=standardName, cfAreaTypesXML=areaTypes, version=version, debug=debug)
inst = CFChecker(uploader=uploader,
useFileName=useFileName,
badc=badc,
coards=coards,
cfRegionNamesXML=regionnames,
cfStandardNamesXML=standardName,
cfAreaTypesXML=areaTypes,
cacheDir=cacheDir,
cacheTables=cacheTables,
cacheTime=cacheTime,
version=version,
debug=debug)
for file in files:
#print
try:
Expand Down
2 changes: 1 addition & 1 deletion test_files/CF_1_0_OK.check
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ CHECKING NetCDF FILE: CF_1_0_OK.nc
=====================
Using CF Checker Version 3.0.6-dev
Checking against CF Version CF-1.0
Using Standard Name Table Version 49 (2018-02-13T08:44:33Z)
Using Standard Name Table Version 50 (2018-03-14T11:01:19Z)
Using Standardized Region Name Table Version 2 (12 June 2013)


Expand Down
4 changes: 2 additions & 2 deletions test_files/CF_1_7.check
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CHECKING NetCDF FILE: CF_1_7.nc
=====================
Using CF Checker Version 3.0.6-dev
Checking against CF Version CF-1.7
Using Standard Name Table Version 49 (2018-02-13T08:44:33Z)
Using Area Type Table Version 6 (22 February 2017)
Using Standard Name Table Version 50 (2018-03-14T11:01:19Z)
Using Area Type Table Version 7 (14 March 2018)
Using Standardized Region Name Table Version 2 (12 June 2013)

ERROR: (2.6.3): Variable external_var2 named as an external variable must not be present in this file
Expand Down
2 changes: 1 addition & 1 deletion test_files/CRM018_test1.check
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ CHECKING NetCDF FILE: CRM018_test1.nc
=====================
Using CF Checker Version 3.0.6-dev
Checking against CF Version CF-1.0
Using Standard Name Table Version 49 (2018-02-13T08:44:33Z)
Using Standard Name Table Version 50 (2018-03-14T11:01:19Z)
Using Standardized Region Name Table Version 2 (12 June 2013)

WARN: (2.6.1): No 'Conventions' attribute present
Expand Down
2 changes: 1 addition & 1 deletion test_files/CRM021_test1.check
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ CHECKING NetCDF FILE: CRM021_test1.nc
=====================
Using CF Checker Version 3.0.6-dev
Checking against CF Version CF-1.0
Using Standard Name Table Version 49 (2018-02-13T08:44:33Z)
Using Standard Name Table Version 50 (2018-03-14T11:01:19Z)
Using Standardized Region Name Table Version 2 (12 June 2013)


Expand Down
2 changes: 1 addition & 1 deletion test_files/CRM024_test1.check
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ CHECKING NetCDF FILE: CRM024_test1.nc
=====================
Using CF Checker Version 3.0.6-dev
Checking against CF Version CF-1.0
Using Standard Name Table Version 49 (2018-02-13T08:44:33Z)
Using Standard Name Table Version 50 (2018-03-14T11:01:19Z)
Using Standardized Region Name Table Version 2 (12 June 2013)

ERROR: (2.6.1): This netCDF file does not appear to contain CF Convention data.
Expand Down
Loading

0 comments on commit 445c4f7

Please sign in to comment.