@@ -14,65 +14,79 @@ import (
14
14
"github.com/apache/arrow/go/v14/parquet"
15
15
"github.com/apache/arrow/go/v14/parquet/file"
16
16
"github.com/apache/arrow/go/v14/parquet/pqarrow"
17
+
18
+ "github.com/elastic/elastic-agent-libs/logp"
17
19
)
18
20
19
21
// BufferedReader parses parquet inputs from io streams.
type BufferedReader struct {
	// cfg holds the reader configuration; note NewBufferedReader may
	// mutate cfg.BatchSize in place when it is unset.
	cfg *Config
	// fileReader is the underlying parquet file reader built over the
	// fully-buffered input bytes.
	fileReader *file.Reader
	// recordReader iterates Arrow record batches decoded from fileReader.
	recordReader pqarrow.RecordReader
	// log is the named logger ("reader.parquet") used for debug tracing.
	log *logp.Logger
}
25
28
26
29
// NewBufferedReader creates a new reader that can decode parquet data from an io.Reader.
27
30
// It will return an error if the parquet data stream cannot be read.
28
31
// Note: As io.ReadAll is used, the entire data stream would be read into memory, so very large data streams
29
32
// may cause memory bottleneck issues.
30
33
func NewBufferedReader (r io.Reader , cfg * Config ) (* BufferedReader , error ) {
31
- batchSize := 1
32
- if cfg .BatchSize > 1 {
33
- batchSize = cfg .BatchSize
34
+ log := logp .L ().Named ("reader.parquet" )
35
+
36
+ if cfg .BatchSize == 0 {
37
+ cfg .BatchSize = 1
34
38
}
39
+ log .Debugw ("creating parquet reader" , "batch_size" , cfg .BatchSize )
35
40
36
41
// reads the contents of the reader object into a byte slice
37
42
data , err := io .ReadAll (r )
38
43
if err != nil {
39
44
return nil , fmt .Errorf ("failed to read data from stream reader: %w" , err )
40
45
}
46
+ log .Debugw ("read data from stream reader" , "size" , len (data ))
41
47
42
48
// defines a memory allocator for allocating memory for Arrow objects
43
49
pool := memory .NewCheckedAllocator (& memory.GoAllocator {})
44
-
50
+ // constructs a parquet file reader object from the byte slice data
45
51
pf , err := file .NewParquetReader (bytes .NewReader (data ), file .WithReadProps (parquet .NewReaderProperties (pool )))
46
52
if err != nil {
47
53
return nil , fmt .Errorf ("failed to create parquet reader: %w" , err )
48
54
}
55
+ log .Debugw ("created parquet reader" )
49
56
50
57
// constructs a reader for converting to Arrow objects from an existing parquet file reader object
51
58
reader , err := pqarrow .NewFileReader (pf , pqarrow.ArrowReadProperties {
52
59
Parallel : cfg .ProcessParallel ,
53
- BatchSize : int64 (batchSize ),
60
+ BatchSize : int64 (cfg . BatchSize ),
54
61
}, pool )
55
62
if err != nil {
56
63
return nil , fmt .Errorf ("failed to create pqarrow parquet reader: %w" , err )
57
64
}
65
+ log .Debugw ("created pqarrow parquet reader" )
58
66
59
67
// constructs a record reader that is capable of reding entire sets of arrow records
60
68
rr , err := reader .GetRecordReader (context .Background (), nil , nil )
61
69
if err != nil {
62
70
return nil , fmt .Errorf ("failed to create parquet record reader: %w" , err )
63
71
}
72
+ log .Debugw ("initialization process completed" )
64
73
65
74
return & BufferedReader {
66
75
cfg : cfg ,
67
76
recordReader : rr ,
68
77
fileReader : pf ,
78
+ log : log ,
69
79
}, nil
70
80
}
71
81
72
82
// Next advances the pointer to point to the next record and returns true if the next record exists.
73
83
// It will return false if there are no more records to read.
74
84
func (sr * BufferedReader ) Next () bool {
75
- return sr .recordReader .Next ()
85
+ next := sr .recordReader .Next ()
86
+ if ! next {
87
+ sr .log .Debugw ("no more records to read" , "next" , next )
88
+ }
89
+ return next
76
90
}
77
91
78
92
// Record reads the current record from the parquet file and returns it as a JSON marshaled byte slice.
@@ -81,13 +95,16 @@ func (sr *BufferedReader) Next() bool {
81
95
func (sr * BufferedReader ) Record () ([]byte , error ) {
82
96
rec := sr .recordReader .Record ()
83
97
if rec == nil {
98
+ sr .log .Debugw ("reached the end of the record reader" , "record_reader" , rec )
84
99
return nil , io .EOF
85
100
}
86
101
defer rec .Release ()
87
102
val , err := rec .MarshalJSON ()
88
103
if err != nil {
89
104
return nil , fmt .Errorf ("failed to marshal JSON for parquet value: %w" , err )
90
105
}
106
+ sr .log .Debugw ("records successfully read" , "batch_size" , sr .cfg .BatchSize )
107
+
91
108
return val , nil
92
109
}
93
110
0 commit comments