77
88import numpy as np
99import pandas as pd
10- from numba import jit
10+ from numba import jit , njit
1111import datetime
1212from dateutil .relativedelta import relativedelta
1313import timeit
1414
15-
1615# look up attributes NAME, data type (Integer; Real; String) and data length by attribute number
1716attrinfo = {1 :('TSTYPE' ,'S' ,4 ), 2 :('STAID' ,'S' ,16 ), 11 :('DAREA' ,'R' ,1 ),
1817 17 :('TCODE' ,'I' ,1 ), 27 :('TSBYR' ,'I' ,1 ), 28 :('TSBMO' ,'I' ,1 ),
2726freq = {7 :'100YS' , 6 :'YS' , 5 :'MS' , 4 :'D' , 3 :'H' , 2 :'min' , 1 :'S' } # pandas date_range() frequency by TCODE, TGROUP
2827
2928
30-
31- def readWDM (wdmfile , hdffile , jupyterlab = True ):
29+ def readWDM (wdmfile , hdffile , compress_output = True ):
3230 iarray = np .fromfile (wdmfile , dtype = np .int32 )
3331 farray = np .fromfile (wdmfile , dtype = np .float32 )
3432
3533 if iarray [0 ] != - 998 :
36- print ('Not a WDM file, magic number is not -990. Stopping!' )
37- return
34+ raise ValueError ('Provided file does not match WDM format. First int32 should be -998.' )
3835 nrecords = iarray [28 ] # first record is File Definition Record
3936 ntimeseries = iarray [31 ]
4037
@@ -43,7 +40,7 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
4340 if not (iarray [index ]== 0 and iarray [index + 1 ]== 0 and iarray [index + 2 ]== 0 and iarray [index + 3 ]== 0 ) and iarray [index + 5 ]== 1 :
4441 dsnlist .append (index )
4542 if len (dsnlist ) != ntimeseries :
46- print ( 'PROGRAM ERROR, wrong number of DSN records found' )
43+ raise RuntimeError ( f'Wrong number of Time Series Records found expecting: { ntimeseries } found: { len ( dsnlist ) } ' )
4744
4845 with pd .HDFStore (hdffile ) as store :
4946 summary = []
@@ -105,13 +102,13 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
105102 dattr [name ] = '' .join ([_inttostr (iarray [k ]) for k in range (ptr , ptr + length // 4 )]).strip ()
106103
107104 # Get timeseries timebase data
108- groups = [] #renaming to groups to be consistent with WDM documentation
105+ groups = []
109106 for i in range (pdat + 1 , pdatv - 1 ):
110107 a = iarray [index + i ]
111108 if a != 0 :
112109 groups .append (_splitposition (a ))
113110 if len (groups ) == 0 :
114- continue # WDM preallocation, but nothing saved here yet
111+ continue
115112
116113 srec , soffset = groups [0 ]
117114 start = _splitdate (iarray [srec * 512 + soffset ])
@@ -130,28 +127,18 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
130127 tindex = pd .date_range (start = start , end = cindex [- 1 ], freq = str (tstep ) + freq [tcode ])
131128 counts = np .diff (np .searchsorted (tindex , cindex ))
132129
133- ## Get timeseries data
134- #floats = np.zeros(sum(counts), dtype=np.float32)
135- #findex = 0
136- #for (rec,offset),count in zip(groups, counts):
137- #findex = getfloats(iarray, farray, floats, findex, rec, offset, count, finalindex, tcode, tstep)
138-
139- #replaced with group/block processing approach
140- dates , values = _process_groups (iarray , farray , groups , tgroup )
141-
142130 ## Write to HDF5 file
131+ dates , values = _process_groups (iarray , farray , groups , tgroup )
143132 series = pd .Series (values , index = dates )
144133 dsname = f'TIMESERIES/TS{ dsn :03d} '
145- # series.to_hdf(store, dsname, complib='blosc', complevel=9)
146- if jupyterlab :
147- series .to_hdf (store , dsname , complib = 'blosc' , complevel = 9 ) # This is the official version
134+ if compress_output :
135+ series .to_hdf (store , dsname , complib = 'blosc' , complevel = 9 )
148136 else :
149- series .to_hdf (store , dsname , format = 't' , data_columns = True ) # show the columns in HDFView
137+ series .to_hdf (store , dsname , format = 't' , data_columns = True )
150138
151139 data = [str (tindex [0 ]), str (tindex [- 1 ]), str (tstep ) + freq [tcode ],
152140 len (series ), dattr ['TSTYPE' ], dattr ['TFILL' ]]
153141 columns = ['Start' , 'Stop' , 'Freq' ,'Length' , 'TSTYPE' , 'TFILL' ]
154- # search = ['STAID', 'STNAM', 'SCENARIO', 'CONSTITUENT','LOCATION']
155142 for x in columns_to_add :
156143 if x in dattr :
157144 data .append (dattr [x ])
@@ -160,30 +147,29 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
160147 summary .append (data )
161148 summaryindx .append (dsname [11 :])
162149
163-
164150 dfsummary = pd .DataFrame (summary , index = summaryindx , columns = columns )
165151 store .put ('TIMESERIES/SUMMARY' ,dfsummary , format = 't' , data_columns = True )
166152 return dfsummary
167153
168-
169154def _todatetime (yr = 1900 , mo = 1 , dy = 1 , hr = 0 ):
170- '''takes yr,mo,dy,hr information then returns its datetime64'''
171- if hr == 24 :
155+ if hr == 24 :
172156 return datetime .datetime (yr , mo , dy , 23 ) + pd .Timedelta (1 ,'h' )
173157 else :
174158 return datetime .datetime (yr , mo , dy , hr )
175159
176160def _splitdate (x ):
177- '''splits WDM int32 DATWRD into year, month, day, hour -> then returns its datetime64'''
178161 return _todatetime (x >> 14 , x >> 10 & 0xF , x >> 5 & 0x1F , x & 0x1F ) # args: year, month, day, hour
179162
180163def _splitcontrol (x ):
181- ''' splits int32 into (qual, compcode, units, tstep, nvalues)'''
182- return (x & 0x1F , x >> 5 & 0x3 , x >> 7 & 0x7 , x >> 10 & 0x3F , x >> 16 )
164+ nval = x >> 16
165+ ltstep = int (x >> 10 & 0x3f ) #relative_delta doesn't handle numpy.32bitInt correctly so convert to python
166+ ltcode = int (x >> 7 & 0x7 )
167+ comp = x >> 5 & 0x3
168+ qual = x & 0x1f
169+ return nval , ltstep , ltcode , comp , qual
183170
184171def _splitposition (x ):
185- ''' splits int32 into (record, offset), converting to Pyton zero based indexing'''
186- return ((x >> 9 ) - 1 , (x & 0x1FF ) - 1 )
172+ return ((x >> 9 ) - 1 , (x & 0x1FF ) - 1 ) #args: record, offset
187173
188174def _inttostr (i ):
189175 return chr (i & 0xFF ) + chr (i >> 8 & 0xFF ) + chr (i >> 16 & 0xFF ) + chr (i >> 24 & 0xFF )
@@ -209,9 +195,6 @@ def _deltatime(ltstep, ltcode):
209195 seconds = ltstep if ltcode == 1 else 0 )
210196 return deltat
211197
212- # HTAO - an alternative implementation of process_group:
213- # 1. used lists to replace numpy matrix;
214- # 2. added a loop to iterate each group and used ending date as the ending condition
215198def _process_groups (iarray , farray , groups , tgroup ):
216199
217200 date_array = []
@@ -226,29 +209,16 @@ def _process_groups(iarray, farray, groups, tgroup):
226209 index += 1
227210
228211 while current_date < lGroupEndDate :
229- #read block control word
230- x = iarray [index ] # block control word or maybe date word at start of group
231- nval = x >> 16
232- ltstep = int (x >> 10 & 0x3f ) #relative_delta doesn't handle numpy.32bitInt correctly so convert to python
233- ltcode = int (x >> 7 & 0x7 )
234- comp = x >> 5 & 0x3
235- qual = x & 0x1f
212+ nval , ltstep , ltcode , comp , qual = _splitcontrol (iarray [index ])
236213 deltat = _deltatime (ltstep , ltcode )
237- #check if next block will exceed array size and allocate additional chunk if needed
238- #if nval + array_index >= value_array.shape[0]:
239- # date_array = np.concatenate((date_array, np.zeros(array_chunk_size, dtype='M8[us]')))
240- # value_array = np.concatenate((value_array, np.zeros(array_chunk_size, dtype=np.float32)))
241-
242214 #compressed - only has single value which applies to full range
243215 if comp == 1 :
244- #TODO - verfiy we do not need support for code 7 or 100YRS? this is in freq but not the programmers documentation
245216 for i in range (0 , nval , 1 ):
246217 current_date = current_date + deltat
247218 date_array .append (current_date )
248219 value_array .append (farray [index + 1 ])
249220 index += 2
250221 offset += 2
251- #not compressed - read nval from floats (number of values)
252222 else :
253223 for i in range (0 , nval , 1 ):
254224 current_date = current_date + deltat
@@ -258,13 +228,9 @@ def _process_groups(iarray, farray, groups, tgroup):
258228 offset += 1 + nval
259229
260230 if offset >= 512 :
261- #print("offset:", str(offset))
262231 offset = 4
263232 index = (pscfwr - 1 ) * 512 + offset
264233 record = pscfwr
265- #pscbkr = iarray[(record - 1) * 512 + 2] #should always be 0 for first record in timeseries
266234 pscfwr = iarray [(record - 1 ) * 512 + 3 ] #should be 0 for last record in timeseries
267235
268- #df = pd.DataFrame({'date':date_array, 'values':value_array})
269- #df.to_csv(R'C:\users\ptomasula\desktop\debugging.csv')
270236 return date_array , value_array
0 commit comments