"""
import datetime
import glob
+import io
import json
import os
from typing import Dict
@@ -27,6 +28,29 @@
from sed.loader.base.loader import BaseLoader


+def load_h5_in_memory(file_path):
+    """
+    Load an HDF5 file entirely into memory and open it with h5py.
+
+    Parameters:
+        file_path (str): Path to the .h5 file.
+
+    Returns:
+        h5py.File: An h5py File object representing the in-memory HDF5 file.
+    """
+    # Read the entire file into memory
+    with open(file_path, "rb") as f:
+        file_content = f.read()
+
+    # Load the content into a BytesIO object
+    file_buffer = io.BytesIO(file_content)
+
+    # Open the HDF5 file using h5py from the in-memory buffer
+    h5_file = h5py.File(file_buffer, "r")
+
+    return h5_file
+
+
def hdf5_to_dataframe(
    files: Sequence[str],
    group_names: Sequence[str] = None,
@@ -67,7 +91,7 @@ def hdf5_to_dataframe(

    # Read a file to parse the file structure
    test_fid = kwds.pop("test_fid", 0)
-    test_proc = h5py.File(files[test_fid])
+    test_proc = load_h5_in_memory(files[test_fid])
    if group_names == []:
        group_names, alias_dict = get_groups_and_aliases(
            h5file=test_proc,
@@ -80,7 +104,7 @@ def hdf5_to_dataframe(
        column_names.append(time_stamp_alias)

    test_array = hdf5_to_array(
-        h5file=test_proc,
+        h5filename=files[test_fid],
        group_names=group_names,
        time_stamps=time_stamps,
        ms_markers_group=ms_markers_group,
@@ -94,7 +118,7 @@ def hdf5_to_dataframe(
        arrays.append(
            da.from_delayed(
                dask.delayed(hdf5_to_array)(
-                    h5file=h5py.File(f),
+                    h5filename=f,
                    group_names=group_names,
                    time_stamps=time_stamps,
                    ms_markers_group=ms_markers_group,
@@ -111,6 +135,8 @@ def hdf5_to_dataframe(

    array_stack = da.concatenate(arrays, axis=1).T

+    test_proc.close()
+
    return ddf.from_dask_array(array_stack, columns=column_names)


@@ -155,7 +181,7 @@ def hdf5_to_timed_dataframe(

    # Read a file to parse the file structure
    test_fid = kwds.pop("test_fid", 0)
-    test_proc = h5py.File(files[test_fid])
+    test_proc = load_h5_in_memory(files[test_fid])
    if group_names == []:
        group_names, alias_dict = get_groups_and_aliases(
            h5file=test_proc,
@@ -168,7 +194,7 @@ def hdf5_to_timed_dataframe(
        column_names.append(time_stamp_alias)

    test_array = hdf5_to_timed_array(
-        h5file=test_proc,
+        h5filename=files[test_fid],
        group_names=group_names,
        time_stamps=time_stamps,
        ms_markers_group=ms_markers_group,
@@ -182,7 +208,7 @@ def hdf5_to_timed_dataframe(
        arrays.append(
            da.from_delayed(
                dask.delayed(hdf5_to_timed_array)(
-                    h5file=h5py.File(f),
+                    h5filename=f,
                    group_names=group_names,
                    time_stamps=time_stamps,
                    ms_markers_group=ms_markers_group,
@@ -198,6 +224,8 @@ def hdf5_to_timed_dataframe(

    array_stack = da.concatenate(arrays, axis=1).T

+    test_proc.close()
+
    return ddf.from_dask_array(array_stack, columns=column_names)


@@ -237,7 +265,7 @@ def get_groups_and_aliases(


def hdf5_to_array(
-    h5file: h5py.File,
+    h5filename: str,
    group_names: Sequence[str],
    data_type: str = "float32",
    time_stamps=False,
@@ -248,14 +276,10 @@ def hdf5_to_array(
    2-dimensional array with the corresponding values.

    Args:
-        h5file (h5py.File):
-            hdf5 file handle to read from
-        group_names (str):
-            group names to read
-        data_type (str, optional):
-            Data type of the output data. Defaults to "float32".
-        time_stamps (bool, optional):
-            Option to calculate time stamps. Defaults to False.
+        h5filename (str): hdf5 file name to read from
+        group_names (str): group names to read
+        data_type (str, optional): Data type of the output data. Defaults to "float32".
+        time_stamps (bool, optional): Option to calculate time stamps. Defaults to False.
        ms_markers_group (str): h5 column containing timestamp information.
            Defaults to "msMarkers".
        first_event_time_stamp_key (str): h5 attribute containing the start
@@ -267,6 +291,8 @@ def hdf5_to_array(

    # Delayed array for loading an HDF5 file of reasonable size (e.g. < 1GB)

+    h5file = load_h5_in_memory(h5filename)
+
    # Read out groups:
    data_list = []
    for group in group_names:
@@ -293,7 +319,7 @@ def hdf5_to_array(
        except KeyError:
            # get the start time of the file from its modification date if the key
            # does not exist (old files)
-            start_time = os.path.getmtime(h5file.filename)  # convert to ms
+            start_time = os.path.getmtime(h5filename)  # convert to ms
            # the modification time points to the time when the file was finished, so we
            # need to correct for the time it took to write the file
            start_time -= len(ms_marker) / 1000
@@ -316,11 +342,13 @@ def hdf5_to_array(

        data_list.append(time_stamp_data)

+    h5file.close()
+
    return np.asarray(data_list)


def hdf5_to_timed_array(
-    h5file: h5py.File,
+    h5filename: str,
    group_names: Sequence[str],
    data_type: str = "float32",
    time_stamps=False,
@@ -331,14 +359,10 @@ def hdf5_to_timed_array(
    timed version of a 2-dimensional array with the corresponding values.

    Args:
-        h5file (h5py.File):
-            hdf5 file handle to read from
-        group_names (str):
-            group names to read
-        data_type (str, optional):
-            Data type of the output data. Defaults to "float32".
-        time_stamps (bool, optional):
-            Option to calculate time stamps. Defaults to False.
+        h5filename (str): hdf5 file name to read from
+        group_names (str): group names to read
+        data_type (str, optional): Data type of the output data. Defaults to "float32".
+        time_stamps (bool, optional): Option to calculate time stamps. Defaults to False.
        ms_markers_group (str): h5 column containing timestamp information.
            Defaults to "msMarkers".
        first_event_time_stamp_key (str): h5 attribute containing the start
@@ -351,6 +375,8 @@ def hdf5_to_timed_array(

    # Delayed array for loading an HDF5 file of reasonable size (e.g. < 1GB)

+    h5file = load_h5_in_memory(h5filename)
+
    # Read out groups:
    data_list = []
    ms_marker = np.asarray(h5file[ms_markers_group])
@@ -377,7 +403,7 @@ def hdf5_to_timed_array(
        except KeyError:
            # get the start time of the file from its modification date if the key
            # does not exist (old files)
-            start_time = os.path.getmtime(h5file.filename)  # convert to ms
+            start_time = os.path.getmtime(h5filename)  # convert to ms
            # the modification time points to the time when the file was finished, so we
            # need to correct for the time it took to write the file
            start_time -= len(ms_marker) / 1000
@@ -386,6 +412,8 @@ def hdf5_to_timed_array(

        data_list.append(time_stamp_data)

+    h5file.close()
+
    return np.asarray(data_list)


@@ -692,16 +720,16 @@ def get_start_and_end_time(self) -> Tuple[float, float]:
        Returns:
            Tuple[float, float]: A tuple containing the start and end time stamps
        """
-        h5file = h5py.File(self.files[0])
+        h5filename = self.files[0]
        timestamps = hdf5_to_array(
-            h5file,
+            h5filename=h5filename,
            group_names=self._config["dataframe"]["hdf5_groupnames"],
            time_stamps=True,
        )
        ts_from = timestamps[-1][1]
-        h5file = h5py.File(self.files[-1])
+        h5filename = self.files[-1]
        timestamps = hdf5_to_array(
-            h5file,
+            h5filename=h5filename,
            group_names=self._config["dataframe"]["hdf5_groupnames"],
            time_stamps=True,
        )
@@ -929,7 +957,7 @@ def get_count_rate(
        for fid in fids:
            try:
                count_rate_, secs_ = get_count_rate(
-                    h5py.File(self.files[fid]),
+                    load_h5_in_memory(self.files[fid]),
                    ms_markers_group=ms_markers_group,
                )
                secs_list.append((accumulated_time + secs_).T)
@@ -974,7 +1002,7 @@ def get_elapsed_time(self, fids: Sequence[int] = None, **kwds) -> float:
        for fid in fids:
            try:
                secs += get_elapsed_time(
-                    h5py.File(self.files[fid]),
+                    load_h5_in_memory(self.files[fid]),
                    ms_markers_group=ms_markers_group,
                )
            except OSError as exc:
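
Note: a minimal, hypothetical usage sketch of the in-memory loading path introduced above. The file name, group name, and import path are placeholder assumptions, not taken from this change; only load_h5_in_memory and the new h5filename keyword of hdf5_to_array come from the diff itself.

    import numpy as np

    # assumed import path for the helpers changed in this diff
    from sed.loader.mpes.loader import hdf5_to_array, load_h5_in_memory

    # read the whole .h5 file into RAM, then access it through h5py;
    # close() releases the handle and the underlying in-memory buffer
    h5file = load_h5_in_memory("example_scan.h5")  # hypothetical file name
    try:
        stream = np.asarray(h5file["Stream_0"])  # hypothetical group name
    finally:
        h5file.close()

    # the array helpers now take a file name instead of an open h5py handle
    arr = hdf5_to_array(h5filename="example_scan.h5", group_names=["Stream_0"])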