Skip to content

Commit 359af8d

Browse files
authored
Merge pull request #430 from KhiopsML/413-multi-line-fields-in-dataframes
Preprocess the dataframe X to remove newlines
2 parents 85ce064 + 3675d28 commit 359af8d

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

khiops/sklearn/dataset.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,10 @@ def write_internal_data_table(dataframe, file_path_or_stream):
404404
- UTF-8 encoding
405405
- The index is not written
406406
407+
Khiops cannot handle multi-line records.
408+
Hence, the carriage returns / line feeds need to be removed from the records
409+
before data is handed over to Khiops.
410+
407411
Parameters
408412
----------
409413
dataframe : `pandas.DataFrame`
@@ -412,6 +416,10 @@ def write_internal_data_table(dataframe, file_path_or_stream):
412416
The path of the internal data table file to be written or a writable file
413417
object.
414418
"""
419+
# Replace carriage returns / line feeds by blank spaces
420+
# in order to always keep single-line text fields
421+
dataframe = dataframe.replace(["\r", "\n"], " ", regex=True)
422+
415423
dataframe.to_csv(
416424
file_path_or_stream,
417425
sep="\t",

khiops/sklearn/estimators.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,13 @@ def _cleanup_dir(target_dir):
223223
class KhiopsEstimator(ABC, BaseEstimator):
224224
"""Base class for Khiops Scikit-learn estimators
225225
226+
.. note::
227+
The input features collection X needs to have single-line records
228+
so that Khiops can handle them.
229+
Hence, multi-line records are preprocessed:
230+
carriage returns / line feeds are replaced
231+
with blank spaces before being handed over to Khiops.
232+
226233
Parameters
227234
----------
228235
verbose : bool, default ``False``
@@ -1695,7 +1702,7 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
16951702
.. note::
16961703
16971704
Visit `the Khiops site <https://khiops.org/learn/understand>`_ to learn
1698-
abouth the automatic feature engineering algorithm.
1705+
about the automatic feature engineering algorithm.
16991706
17001707
Parameters
17011708
----------

tests/test_dataset_class.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or #
55
# see the "LICENSE.md" file for more details. #
66
######################################################################################
7-
"""Test consistency of the created files with the input data"""
87
import os
98
import shutil
109
import unittest
@@ -700,3 +699,47 @@ def _test_domain_coherence(self, ds, ref_var_types):
700699
for var in out_domain.get_dictionary(table.name).variables
701700
}
702701
self.assertEqual(ref_var_types[table.name], out_dictionary_var_types)
702+
703+
704+
class DataFramePreprocessingTests(unittest.TestCase):
705+
"""Check that the preprocessing of X (input features collection) is actually done
706+
when writing the csv used later by Khiops
707+
"""
708+
709+
def setUp(self):
710+
"""Set-up test-specific output directory"""
711+
self.output_dir = os.path.join("resources", "tmp", self._testMethodName)
712+
os.makedirs(self.output_dir, exist_ok=True)
713+
714+
def tearDown(self):
715+
"""Clean-up test-specific output directory"""
716+
shutil.rmtree(self.output_dir, ignore_errors=True)
717+
del self.output_dir
718+
719+
@staticmethod
720+
def create_monotable_dataset_with_newlines():
721+
data = {
722+
"User_ID": [
723+
"Cm6fu01r99",
724+
],
725+
"Age": [39],
726+
"Title": [
727+
"Shimmer,\nsurprisingly\n\rgoes with lots",
728+
],
729+
}
730+
dataset = pd.DataFrame(data)
731+
return dataset
732+
733+
def test_newlines_removed_from_csv_file_for_khiops(self):
734+
dataset = Dataset(
735+
DataFramePreprocessingTests.create_monotable_dataset_with_newlines()
736+
)
737+
738+
out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir)
739+
out_table = pd.read_csv(out_table_path, sep="\t")
740+
741+
self.assertEqual(
742+
"Shimmer, surprisingly goes with lots",
743+
out_table.Title[0],
744+
"Newlines should have been removed from the data",
745+
)

0 commit comments

Comments
 (0)