Skip to content

Commit 3675d28

Browse files
author
Thierry RAMORASOAVINA
committed
Preprocess the dataframe X (collection of input features) to remove newlines.
To avoid any code duplication, the best place to perform the preprocessing is just before handing the data over to Khiops (when writing the CSV file).
1 parent 85ce064 commit 3675d28

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

khiops/sklearn/dataset.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,10 @@ def write_internal_data_table(dataframe, file_path_or_stream):
404404
- UTF-8 encoding
405405
- The index is not written
406406
407+
Khiops cannot handle multi-line records.
408+
Hence, the carriage returns / line feeds need to be removed from the records
409+
before data is handed over to Khiops.
410+
407411
Parameters
408412
----------
409413
dataframe : `pandas.DataFrame`
@@ -412,6 +416,10 @@ def write_internal_data_table(dataframe, file_path_or_stream):
412416
The path of the internal data table file to be written or a writable file
413417
object.
414418
"""
419+
# Replace carriage returns / line feeds with blank spaces
420+
# in order to always keep single-line text fields
421+
dataframe = dataframe.replace(["\r", "\n"], " ", regex=True)
422+
415423
dataframe.to_csv(
416424
file_path_or_stream,
417425
sep="\t",

khiops/sklearn/estimators.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,13 @@ def _cleanup_dir(target_dir):
223223
class KhiopsEstimator(ABC, BaseEstimator):
224224
"""Base class for Khiops Scikit-learn estimators
225225
226+
.. note::
227+
The input features collection X needs to have single-line records
228+
so that Khiops can handle them.
229+
Hence, multi-line records are preprocessed:
230+
carriage returns / line feeds are replaced
231+
with blank spaces before being handed over to Khiops.
232+
226233
Parameters
227234
----------
228235
verbose : bool, default ``False``
@@ -1695,7 +1702,7 @@ class KhiopsClassifier(ClassifierMixin, KhiopsPredictor):
16951702
.. note::
16961703
16971704
Visit `the Khiops site <https://khiops.org/learn/understand>`_ to learn
1698-
abouth the automatic feature engineering algorithm.
1705+
about the automatic feature engineering algorithm.
16991706
17001707
Parameters
17011708
----------

tests/test_dataset_class.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or #
55
# see the "LICENSE.md" file for more details. #
66
######################################################################################
7-
"""Test consistency of the created files with the input data"""
87
import os
98
import shutil
109
import unittest
@@ -700,3 +699,47 @@ def _test_domain_coherence(self, ds, ref_var_types):
700699
for var in out_domain.get_dictionary(table.name).variables
701700
}
702701
self.assertEqual(ref_var_types[table.name], out_dictionary_var_types)
702+
703+
704+
class DataFramePreprocessingTests(unittest.TestCase):
705+
"""Check that the preprocessing of X (input features collection) is actually done
706+
when writing the csv used later by Khiops
707+
"""
708+
709+
def setUp(self):
710+
"""Set-up test-specific output directory"""
711+
self.output_dir = os.path.join("resources", "tmp", self._testMethodName)
712+
os.makedirs(self.output_dir, exist_ok=True)
713+
714+
def tearDown(self):
715+
"""Clean-up test-specific output directory"""
716+
shutil.rmtree(self.output_dir, ignore_errors=True)
717+
del self.output_dir
718+
719+
@staticmethod
720+
def create_monotable_dataset_with_newlines():
721+
data = {
722+
"User_ID": [
723+
"Cm6fu01r99",
724+
],
725+
"Age": [39],
726+
"Title": [
727+
"Shimmer,\nsurprisingly\n\rgoes with lots",
728+
],
729+
}
730+
dataset = pd.DataFrame(data)
731+
return dataset
732+
733+
def test_newlines_removed_from_csv_file_for_khiops(self):
734+
dataset = Dataset(
735+
DataFramePreprocessingTests.create_monotable_dataset_with_newlines()
736+
)
737+
738+
out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir)
739+
out_table = pd.read_csv(out_table_path, sep="\t")
740+
741+
self.assertEqual(
742+
"Shimmer, surprisingly goes with lots",
743+
out_table.Title[0],
744+
"Newlines should have been removed from the data",
745+
)

0 commit comments

Comments
 (0)