
Commit 88cc526

[libbeat][parquet reader] - Added debug logs & improved batch_size tracking (#40651) (#40673)

(cherry picked from commit 7f317fd)
Co-authored-by: ShourieG <105607378+ShourieG@users.noreply.github.com>

1 parent 883886f

3 files changed: 31 additions, 6 deletions

CHANGELOG-developer.next.asciidoc

Lines changed: 4 additions & 0 deletions
@@ -197,6 +197,10 @@ The list below covers the major changes between 7.0.0-rc2 and main only.
 - Move x-pack/filebeat/input/salesforce jwt import to v5. {pull}39823[39823]
 - Drop x-pack/filebeat/input dependency on github.com/lestrrat-go/jwx/v2. {pull}39968[39968]
 - Added `ignore_empty_values` flag in `decode_cef` Filebeat processor. {pull}40268[40268]
+- Bump version of elastic/toutoumomoma to remove internal forks of stdlib debug packages. {pull}40325[40325]
+- Refactor x-pack/filebeat/input/websocket for generalisation. {pull}40308[40308]
+- Add a configuration option for TCP/UDP network type. {issue}40407[40407] {pull}40623[40623]
+- Added debug logging to parquet reader in x-pack/libbeat/reader. {pull}40651[40651]
 
 ==== Deprecated

x-pack/libbeat/reader/parquet/parquet.go

Lines changed: 23 additions & 6 deletions
@@ -14,65 +14,79 @@ import (
     "github.com/apache/arrow/go/v14/parquet"
     "github.com/apache/arrow/go/v14/parquet/file"
     "github.com/apache/arrow/go/v14/parquet/pqarrow"
+
+    "github.com/elastic/elastic-agent-libs/logp"
 )
 
 // BufferedReader parses parquet inputs from io streams.
 type BufferedReader struct {
     cfg          *Config
     fileReader   *file.Reader
     recordReader pqarrow.RecordReader
+    log          *logp.Logger
 }
 
 // NewBufferedReader creates a new reader that can decode parquet data from an io.Reader.
 // It will return an error if the parquet data stream cannot be read.
 // Note: As io.ReadAll is used, the entire data stream would be read into memory, so very large data streams
 // may cause memory bottleneck issues.
 func NewBufferedReader(r io.Reader, cfg *Config) (*BufferedReader, error) {
-    batchSize := 1
-    if cfg.BatchSize > 1 {
-        batchSize = cfg.BatchSize
+    log := logp.L().Named("reader.parquet")
+
+    if cfg.BatchSize == 0 {
+        cfg.BatchSize = 1
     }
+    log.Debugw("creating parquet reader", "batch_size", cfg.BatchSize)
 
     // reads the contents of the reader object into a byte slice
     data, err := io.ReadAll(r)
     if err != nil {
         return nil, fmt.Errorf("failed to read data from stream reader: %w", err)
     }
+    log.Debugw("read data from stream reader", "size", len(data))
 
     // defines a memory allocator for allocating memory for Arrow objects
     pool := memory.NewCheckedAllocator(&memory.GoAllocator{})
-
+    // constructs a parquet file reader object from the byte slice data
     pf, err := file.NewParquetReader(bytes.NewReader(data), file.WithReadProps(parquet.NewReaderProperties(pool)))
     if err != nil {
         return nil, fmt.Errorf("failed to create parquet reader: %w", err)
     }
+    log.Debugw("created parquet reader")
 
     // constructs a reader for converting to Arrow objects from an existing parquet file reader object
     reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{
         Parallel:  cfg.ProcessParallel,
-        BatchSize: int64(batchSize),
+        BatchSize: int64(cfg.BatchSize),
     }, pool)
     if err != nil {
         return nil, fmt.Errorf("failed to create pqarrow parquet reader: %w", err)
     }
+    log.Debugw("created pqarrow parquet reader")
 
     // constructs a record reader that is capable of reading entire sets of arrow records
     rr, err := reader.GetRecordReader(context.Background(), nil, nil)
     if err != nil {
         return nil, fmt.Errorf("failed to create parquet record reader: %w", err)
     }
+    log.Debugw("initialization process completed")
 
     return &BufferedReader{
         cfg:          cfg,
         recordReader: rr,
         fileReader:   pf,
+        log:          log,
     }, nil
 }
 
 // Next advances the pointer to point to the next record and returns true if the next record exists.
 // It will return false if there are no more records to read.
 func (sr *BufferedReader) Next() bool {
-    return sr.recordReader.Next()
+    next := sr.recordReader.Next()
+    if !next {
+        sr.log.Debugw("no more records to read", "next", next)
+    }
+    return next
 }
 
 // Record reads the current record from the parquet file and returns it as a JSON marshaled byte slice.
@@ -81,13 +95,16 @@ func (sr *BufferedReader) Next() bool {
 func (sr *BufferedReader) Record() ([]byte, error) {
     rec := sr.recordReader.Record()
     if rec == nil {
+        sr.log.Debugw("reached the end of the record reader", "record_reader", rec)
         return nil, io.EOF
     }
     defer rec.Release()
     val, err := rec.MarshalJSON()
     if err != nil {
         return nil, fmt.Errorf("failed to marshal JSON for parquet value: %w", err)
     }
+    sr.log.Debugw("records successfully read", "batch_size", sr.cfg.BatchSize)
+
     return val, nil
 }
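
For context, here is a minimal consumption sketch of the API touched above; it is an editor's illustration, not part of the commit. It wires up NewBufferedReader, Next, and Record with debug logging enabled so the new "reader.parquet" messages are visible. The beats import path, the Config literal, the logp.DevelopmentSetup call, and the input file name are assumptions.

package main

import (
    "fmt"
    "os"

    "github.com/elastic/elastic-agent-libs/logp"

    "github.com/elastic/beats/v7/x-pack/libbeat/reader/parquet"
)

func main() {
    // Assumed to configure debug-level logging to stderr; the tests in this
    // commit call logp.TestingSetup for the same purpose.
    if err := logp.DevelopmentSetup(); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }

    f, err := os.Open("example.parquet") // hypothetical input file
    if err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
    defer f.Close()

    // BatchSize 0 exercises the new default-to-1 branch in NewBufferedReader;
    // ProcessParallel is the other Config field the constructor reads.
    r, err := parquet.NewBufferedReader(f, &parquet.Config{BatchSize: 0, ProcessParallel: false})
    if err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }

    // Each successful Record call logs batch_size; once Next returns false,
    // the new "no more records to read" debug line fires.
    for r.Next() {
        rec, err := r.Record()
        if err != nil {
            fmt.Fprintln(os.Stderr, err)
            os.Exit(1)
        }
        fmt.Println(string(rec))
    }
}

Because NewBufferedReader now writes the defaulted value back into cfg.BatchSize, the batch_size reported by the Record debug log always reflects the effective batch size, including the implicit 1.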

x-pack/libbeat/reader/parquet/parquet_test.go

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,8 @@ import (
     "github.com/apache/arrow/go/v14/arrow/memory"
     "github.com/apache/arrow/go/v14/parquet/pqarrow"
     "github.com/stretchr/testify/assert"
+
+    "github.com/elastic/elastic-agent-libs/logp"
 )
 
 // all test files are read from/stored within the "testdata" directory
@@ -55,6 +57,7 @@ func TestParquetWithRandomData(t *testing.T) {
         },
     }
 
+    logp.TestingSetup()
     for i, tc := range testCases {
         name := fmt.Sprintf("Test parquet files with rows=%d, and columns=%d", tc.rows, tc.columns)
         t.Run(name, func(t *testing.T) {
@@ -189,6 +192,7 @@ func TestParquetWithFiles(t *testing.T) {
         },
     }
 
+    logp.TestingSetup()
     for _, tc := range testCases {
         name := fmt.Sprintf("Test parquet files with source file=%s, and target comparison file=%s", tc.parquetFile, tc.jsonFile)
         t.Run(name, func(t *testing.T) {
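
A note on the test change: logp.TestingSetup is called once per test function before the table-driven subtests run. A package-level TestMain would be an equivalent one-time setup; the sketch below is an editor's illustration (package clause assumed), not part of the commit.

package parquet

import (
    "os"
    "testing"

    "github.com/elastic/elastic-agent-libs/logp"
)

// TestMain performs the logger setup once for the whole package instead of
// at the top of each test function.
func TestMain(m *testing.M) {
    _ = logp.TestingSetup() // error ignored, matching the commit's usage
    os.Exit(m.Run())
}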
