Skip to content

Commit 2e205f1

Browse files
committed
tidy up readWDM
General refactoring to cleanup code by removing old comments and slight restructuring to increase readability. Also replace print statements for error with raise exceptions.
1 parent 2561869 commit 2e205f1

1 file changed

Lines changed: 19 additions & 53 deletions

File tree

HSP2tools/readWDM.py

Lines changed: 19 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@
77

88
import numpy as np
99
import pandas as pd
10-
from numba import jit
10+
from numba import jit, njit
1111
import datetime
1212
from dateutil.relativedelta import relativedelta
1313
import timeit
1414

15-
1615
# look up attributes NAME, data type (Integer; Real; String) and data length by attribute number
1716
attrinfo = {1:('TSTYPE','S',4), 2:('STAID','S',16), 11:('DAREA','R',1),
1817
17:('TCODE','I',1), 27:('TSBYR','I',1), 28:('TSBMO','I',1),
@@ -27,14 +26,12 @@
2726
freq = {7:'100YS', 6:'YS', 5:'MS', 4:'D', 3:'H', 2:'min', 1:'S'} # pandas date_range() frequency by TCODE, TGROUP
2827

2928

30-
31-
def readWDM(wdmfile, hdffile, jupyterlab=True):
29+
def readWDM(wdmfile, hdffile, compress_output=True):
3230
iarray = np.fromfile(wdmfile, dtype=np.int32)
3331
farray = np.fromfile(wdmfile, dtype=np.float32)
3432

3533
if iarray[0] != -998:
36-
print('Not a WDM file, magic number is not -990. Stopping!')
37-
return
34+
raise ValueError ('Provided file does not match WDM format. First int32 should be -998.')
3835
nrecords = iarray[28] # first record is File Definition Record
3936
ntimeseries = iarray[31]
4037

@@ -43,7 +40,7 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
4340
if not (iarray[index]==0 and iarray[index+1]==0 and iarray[index+2]==0 and iarray[index+3]==0) and iarray[index+5]==1:
4441
dsnlist.append(index)
4542
if len(dsnlist) != ntimeseries:
46-
print('PROGRAM ERROR, wrong number of DSN records found')
43+
raise RuntimeError (f'Wrong number of Time Series Records found expecting:{ntimeseries} found:{len(dsnlist)}')
4744

4845
with pd.HDFStore(hdffile) as store:
4946
summary = []
@@ -105,13 +102,13 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
105102
dattr[name] = ''.join([_inttostr(iarray[k]) for k in range(ptr, ptr + length//4)]).strip()
106103

107104
# Get timeseries timebase data
108-
groups = [] #renaming to groups to be consistent with WDM documentation
105+
groups = []
109106
for i in range(pdat+1, pdatv-1):
110107
a = iarray[index+i]
111108
if a != 0:
112109
groups.append(_splitposition(a))
113110
if len(groups) == 0:
114-
continue # WDM preallocation, but nothing saved here yet
111+
continue
115112

116113
srec, soffset = groups[0]
117114
start = _splitdate(iarray[srec*512 + soffset])
@@ -130,28 +127,18 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
130127
tindex = pd.date_range(start=start, end=cindex[-1], freq=str(tstep) + freq[tcode])
131128
counts = np.diff(np.searchsorted(tindex, cindex))
132129

133-
## Get timeseries data
134-
#floats = np.zeros(sum(counts), dtype=np.float32)
135-
#findex = 0
136-
#for (rec,offset),count in zip(groups, counts):
137-
#findex = getfloats(iarray, farray, floats, findex, rec, offset, count, finalindex, tcode, tstep)
138-
139-
#replaced with group/block processing approach
140-
dates, values = _process_groups(iarray, farray, groups, tgroup)
141-
142130
## Write to HDF5 file
131+
dates, values = _process_groups(iarray, farray, groups, tgroup)
143132
series = pd.Series(values, index=dates)
144133
dsname = f'TIMESERIES/TS{dsn:03d}'
145-
# series.to_hdf(store, dsname, complib='blosc', complevel=9)
146-
if jupyterlab:
147-
series.to_hdf(store, dsname, complib='blosc', complevel=9) # This is the official version
134+
if compress_output:
135+
series.to_hdf(store, dsname, complib='blosc', complevel=9)
148136
else:
149-
series.to_hdf(store, dsname, format='t', data_columns=True) # show the columns in HDFView
137+
series.to_hdf(store, dsname, format='t', data_columns=True)
150138

151139
data = [str(tindex[0]), str(tindex[-1]), str(tstep) + freq[tcode],
152140
len(series), dattr['TSTYPE'], dattr['TFILL']]
153141
columns = ['Start', 'Stop', 'Freq','Length', 'TSTYPE', 'TFILL']
154-
# search = ['STAID', 'STNAM', 'SCENARIO', 'CONSTITUENT','LOCATION']
155142
for x in columns_to_add:
156143
if x in dattr:
157144
data.append(dattr[x])
@@ -160,30 +147,29 @@ def readWDM(wdmfile, hdffile, jupyterlab=True):
160147
summary.append(data)
161148
summaryindx.append(dsname[11:])
162149

163-
164150
dfsummary = pd.DataFrame(summary, index=summaryindx, columns=columns)
165151
store.put('TIMESERIES/SUMMARY',dfsummary, format='t', data_columns=True)
166152
return dfsummary
167153

168-
169154
def _todatetime(yr=1900, mo=1, dy=1, hr=0):
170-
'''takes yr,mo,dy,hr information then returns its datetime64'''
171-
if hr == 24:
155+
if hr == 24:
172156
return datetime.datetime(yr, mo, dy, 23) + pd.Timedelta(1,'h')
173157
else:
174158
return datetime.datetime(yr, mo, dy, hr)
175159

176160
def _splitdate(x):
177-
'''splits WDM int32 DATWRD into year, month, day, hour -> then returns its datetime64'''
178161
return _todatetime(x >> 14, x >> 10 & 0xF, x >> 5 & 0x1F, x & 0x1F) # args: year, month, day, hour
179162

180163
def _splitcontrol(x):
181-
''' splits int32 into (qual, compcode, units, tstep, nvalues)'''
182-
return(x & 0x1F, x >> 5 & 0x3, x >> 7 & 0x7, x >> 10 & 0x3F, x >> 16)
164+
nval = x >> 16
165+
ltstep = int(x >> 10 & 0x3f) #relative_delta doesn't handle numpy.32bitInt correctly so convert to python
166+
ltcode = int(x >> 7 & 0x7)
167+
comp = x >> 5 & 0x3
168+
qual = x & 0x1f
169+
return nval, ltstep, ltcode, comp, qual
183170

184171
def _splitposition(x):
185-
''' splits int32 into (record, offset), converting to Pyton zero based indexing'''
186-
return((x>>9) - 1, (x&0x1FF) - 1)
172+
return((x>>9) - 1, (x&0x1FF) - 1) #args: record, offset
187173

188174
def _inttostr(i):
189175
return chr(i & 0xFF) + chr(i>>8 & 0xFF) + chr(i>>16 & 0xFF) + chr(i>>24 & 0xFF)
@@ -209,9 +195,6 @@ def _deltatime(ltstep, ltcode):
209195
seconds= ltstep if ltcode == 1 else 0)
210196
return deltat
211197

212-
# HTAO - an alternative implementation of process_group:
213-
# 1. used lists to replace numpy matrix;
214-
# 2. added a loop to iterate each group and used ending date as the ending condition
215198
def _process_groups(iarray, farray, groups, tgroup):
216199

217200
date_array = []
@@ -226,29 +209,16 @@ def _process_groups(iarray, farray, groups, tgroup):
226209
index +=1
227210

228211
while current_date < lGroupEndDate:
229-
#read block control word
230-
x = iarray[index] # block control word or maybe date word at start of group
231-
nval = x >> 16
232-
ltstep = int(x >> 10 & 0x3f) #relative_delta doesn't handle numpy.32bitInt correctly so convert to python
233-
ltcode = int(x >> 7 & 0x7)
234-
comp = x >> 5 & 0x3
235-
qual = x & 0x1f
212+
nval, ltstep, ltcode, comp, qual = _splitcontrol(iarray[index])
236213
deltat = _deltatime(ltstep, ltcode)
237-
#check if next block will exceed array size and allocate additional chunk if needed
238-
#if nval + array_index >= value_array.shape[0]:
239-
# date_array = np.concatenate((date_array, np.zeros(array_chunk_size, dtype='M8[us]')))
240-
# value_array = np.concatenate((value_array, np.zeros(array_chunk_size, dtype=np.float32)))
241-
242214
#compressed - only has single value which applies to full range
243215
if comp == 1:
244-
#TODO - verfiy we do not need support for code 7 or 100YRS? this is in freq but not the programmers documentation
245216
for i in range(0, nval, 1):
246217
current_date = current_date + deltat
247218
date_array.append(current_date)
248219
value_array.append(farray[index + 1])
249220
index += 2
250221
offset +=2
251-
#not compressed - read nval from floats (number of values)
252222
else:
253223
for i in range(0, nval, 1):
254224
current_date = current_date + deltat
@@ -258,13 +228,9 @@ def _process_groups(iarray, farray, groups, tgroup):
258228
offset +=1 + nval
259229

260230
if offset >= 512:
261-
#print("offset:", str(offset))
262231
offset = 4
263232
index = (pscfwr - 1) * 512 + offset
264233
record = pscfwr
265-
#pscbkr = iarray[(record - 1) * 512 + 2] #should always be 0 for first record in timeseries
266234
pscfwr = iarray[(record - 1) * 512 + 3] #should be 0 for last record in timeseries
267235

268-
#df = pd.DataFrame({'date':date_array, 'values':value_array})
269-
#df.to_csv(R'C:\users\ptomasula\desktop\debugging.csv')
270236
return date_array, value_array

0 commit comments

Comments
 (0)