Skip to content

Commit 07b369f

Browse files
authored
Merge pull request #421 from KhiopsML/400-exploit-a-hierarchical-multi-table-schema-for-sklearn-api
Add support for the new multi-table schema spec in the s…
2 parents 359af8d + e061a74 commit 07b369f

23 files changed

+692
-889
lines changed

doc/samples/samples_sklearn.rst

Lines changed: 18 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,8 @@ Samples
168168
169169
# Create the dataset spec and the target
170170
X = {
171-
"main_table": "Accidents",
172-
"tables": {
173-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
171+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
172+
"additional_data_tables": {
174173
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
175174
},
176175
}
@@ -224,18 +223,12 @@ Samples
224223
225224
# Build the multi-table dataset spec (drop the target column "Gravity")
226225
X = {
227-
"main_table": "Accidents",
228-
"tables": {
229-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
226+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
227+
"additional_data_tables": {
230228
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
231-
"Users": (users_df, ["AccidentId", "VehicleId"]),
232-
"Places": (places_df, "AccidentId"),
229+
"Vehicles/Users": (users_df, ["AccidentId", "VehicleId"]),
230+
"Places": (places_df, ["AccidentId"], True),
233231
},
234-
"relations": [
235-
("Accidents", "Vehicles"),
236-
("Vehicles", "Users"),
237-
("Accidents", "Places", True),
238-
],
239232
}
240233
241234
# Load the target variable "Gravity"
@@ -411,16 +404,14 @@ Samples
411404
# Create the dataset multitable specification for the train/test split
412405
# Each table is a tuple (dataframe, key_columns); additional tables are also named
413406
X_train = {
414-
"main_table": "Accidents",
415-
"tables": {
416-
"Accidents": (X_train_main, "AccidentId"),
407+
"main_table": (X_train_main, ["AccidentId"]),
408+
"additional_data_tables": {
417409
"Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]),
418410
},
419411
}
420412
X_test = {
421-
"main_table": "Accidents",
422-
"tables": {
423-
"Accidents": (X_test_main, "AccidentId"),
413+
"main_table": (X_test_main, ["AccidentId"]),
414+
"additional_data_tables": {
424415
"Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]),
425416
},
426417
}
@@ -557,9 +548,8 @@ Samples
557548
558549
# Build the multi-table dataset spec (drop the target column "Gravity")
559550
X = {
560-
"main_table": "Accidents",
561-
"tables": {
562-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
551+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
552+
"additional_data_tables": {
563553
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
564554
},
565555
}
@@ -596,18 +586,12 @@ Samples
596586
597587
# Build the multi-table dataset spec (drop the target column "Gravity")
598588
X = {
599-
"main_table": "Accidents",
600-
"tables": {
601-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
589+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
590+
"additional_data_tables": {
602591
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
603-
"Users": (users_df, ["AccidentId", "VehicleId"]),
604-
"Places": (places_df, "AccidentId"),
592+
"Vehicles/Users": (users_df, ["AccidentId", "VehicleId"]),
593+
"Places": (places_df, ["AccidentId"], True),
605594
},
606-
"relations": [
607-
("Accidents", "Vehicles"),
608-
("Vehicles", "Users"),
609-
("Accidents", "Places", True),
610-
],
611595
}
612596
613597
# Load the target variable "Gravity"
@@ -701,14 +685,10 @@ Samples
701685
702686
# Build the multi-table dataset spec (drop the target column "Gravity")
703687
X = {
704-
"main_table": "Accidents",
705-
"tables": {
706-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
688+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
689+
"additional_data_tables": {
707690
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
708691
},
709-
"relations": [
710-
("Accidents", "Vehicles"),
711-
],
712692
}
713693
714694
# Load the target variable "Gravity"

khiops/core/helpers.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323

2424

2525
def _build_multi_table_dictionary_domain(
26-
dictionary_domain, root_dictionary_name, secondary_table_variable_name
26+
dictionary_domain,
27+
root_dictionary_name,
28+
secondary_table_variable_name,
29+
update_secondary_table_name=False,
2730
):
2831
"""Builds a multi-table dictionary domain from a dictionary with a key
2932
Parameters
@@ -34,6 +37,9 @@ def _build_multi_table_dictionary_domain(
3437
Name for the new root dictionary
3538
secondary_table_variable_name : str
3639
Name, in the root dictionary, for the "table" variable of the secondary table.
40+
update_secondary_table_name : bool, default `False`
41+
If ``True``, then rename the secondary table to the value of
42+
``secondary_table_variable_name``. Otherwise, keep the original table name.
3743
3844
Returns
3945
-------
@@ -103,11 +109,17 @@ def _build_multi_table_dictionary_domain(
103109
target_variable = Variable()
104110
target_variable.name = secondary_table_variable_name
105111
target_variable.type = "Table"
106-
target_variable.object_type = root_source_dictionary.name
112+
if update_secondary_table_name:
113+
target_variable.object_type = secondary_table_variable_name
114+
else:
115+
target_variable.object_type = root_source_dictionary.name
107116
root_target_dictionary.add_variable(target_variable)
108117

109118
# Build secondary target dictionary, by copying root source dictionary
110119
secondary_target_dictionary = root_source_dictionary.copy()
120+
secondary_target_dictionary.root = False
121+
if update_secondary_table_name:
122+
secondary_target_dictionary.name = secondary_table_variable_name
111123

112124
# Build target domain and add dictionaries to it
113125
target_domain = DictionaryDomain()

khiops/samples/samples_sklearn.ipynb

Lines changed: 18 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,8 @@
180180
"\n",
181181
"# Create the dataset spec and the target\n",
182182
"X = {\n",
183-
" \"main_table\": \"Accidents\",\n",
184-
" \"tables\": {\n",
185-
" \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n",
183+
" \"main_table\": (accidents_df.drop(\"Gravity\", axis=1), [\"AccidentId\"]),\n",
184+
" \"additional_data_tables\": {\n",
186185
" \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n",
187186
" },\n",
188187
"}\n",
@@ -249,18 +248,12 @@
249248
"\n",
250249
"# Build the multi-table dataset spec (drop the target column \"Gravity\")\n",
251250
"X = {\n",
252-
" \"main_table\": \"Accidents\",\n",
253-
" \"tables\": {\n",
254-
" \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n",
251+
" \"main_table\": (accidents_df.drop(\"Gravity\", axis=1), [\"AccidentId\"]),\n",
252+
" \"additional_data_tables\": {\n",
255253
" \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n",
256-
" \"Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n",
257-
" \"Places\": (places_df, \"AccidentId\"),\n",
254+
" \"Vehicles/Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n",
255+
" \"Places\": (places_df, [\"AccidentId\"], True),\n",
258256
" },\n",
259-
" \"relations\": [\n",
260-
" (\"Accidents\", \"Vehicles\"),\n",
261-
" (\"Vehicles\", \"Users\"),\n",
262-
" (\"Accidents\", \"Places\", True),\n",
263-
" ],\n",
264257
"}\n",
265258
"\n",
266259
"# Load the target variable \"Gravity\"\n",
@@ -475,16 +468,14 @@
475468
"# Create the dataset multitable specification for the train/test split\n",
476469
"# Each table is a tuple (dataframe, key_columns); additional tables are also named\n",
477470
"X_train = {\n",
478-
" \"main_table\": \"Accidents\",\n",
479-
" \"tables\": {\n",
480-
" \"Accidents\": (X_train_main, \"AccidentId\"),\n",
471+
" \"main_table\": (X_train_main, [\"AccidentId\"]),\n",
472+
" \"additional_data_tables\": {\n",
481473
" \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
482474
" },\n",
483475
"}\n",
484476
"X_test = {\n",
485-
" \"main_table\": \"Accidents\",\n",
486-
" \"tables\": {\n",
487-
" \"Accidents\": (X_test_main, \"AccidentId\"),\n",
477+
" \"main_table\": (X_test_main, [\"AccidentId\"]),\n",
478+
" \"additional_data_tables\": {\n",
488479
" \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
489480
" },\n",
490481
"}\n",
@@ -660,9 +651,8 @@
660651
"\n",
661652
"# Build the multi-table dataset spec (drop the target column \"Gravity\")\n",
662653
"X = {\n",
663-
" \"main_table\": \"Accidents\",\n",
664-
" \"tables\": {\n",
665-
" \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n",
654+
" \"main_table\": (accidents_df.drop(\"Gravity\", axis=1), [\"AccidentId\"]),\n",
655+
" \"additional_data_tables\": {\n",
666656
" \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n",
667657
" },\n",
668658
"}\n",
@@ -712,18 +702,12 @@
712702
"\n",
713703
"# Build the multi-table dataset spec (drop the target column \"Gravity\")\n",
714704
"X = {\n",
715-
" \"main_table\": \"Accidents\",\n",
716-
" \"tables\": {\n",
717-
" \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n",
705+
" \"main_table\": (accidents_df.drop(\"Gravity\", axis=1), [\"AccidentId\"]),\n",
706+
" \"additional_data_tables\": {\n",
718707
" \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n",
719-
" \"Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n",
720-
" \"Places\": (places_df, \"AccidentId\"),\n",
708+
" \"Vehicles/Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n",
709+
" \"Places\": (places_df, [\"AccidentId\"], True),\n",
721710
" },\n",
722-
" \"relations\": [\n",
723-
" (\"Accidents\", \"Vehicles\"),\n",
724-
" (\"Vehicles\", \"Users\"),\n",
725-
" (\"Accidents\", \"Places\", True),\n",
726-
" ],\n",
727711
"}\n",
728712
"\n",
729713
"# Load the target variable \"Gravity\"\n",
@@ -843,14 +827,10 @@
843827
"\n",
844828
"# Build the multi-table dataset spec (drop the target column \"Gravity\")\n",
845829
"X = {\n",
846-
" \"main_table\": \"Accidents\",\n",
847-
" \"tables\": {\n",
848-
" \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n",
830+
" \"main_table\": (accidents_df.drop(\"Gravity\", axis=1), [\"AccidentId\"]),\n",
831+
" \"additional_data_tables\": {\n",
849832
" \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n",
850833
" },\n",
851-
" \"relations\": [\n",
852-
" (\"Accidents\", \"Vehicles\"),\n",
853-
" ],\n",
854834
"}\n",
855835
"\n",
856836
"# Load the target variable \"Gravity\"\n",

khiops/samples/samples_sklearn.py

Lines changed: 18 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,8 @@ def khiops_classifier_multitable_star():
163163

164164
# Create the dataset spec and the target
165165
X = {
166-
"main_table": "Accidents",
167-
"tables": {
168-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
166+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
167+
"additional_data_tables": {
169168
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
170169
},
171170
}
@@ -224,18 +223,12 @@ def khiops_classifier_multitable_snowflake():
224223

225224
# Build the multi-table dataset spec (drop the target column "Gravity")
226225
X = {
227-
"main_table": "Accidents",
228-
"tables": {
229-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
226+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
227+
"additional_data_tables": {
230228
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
231-
"Users": (users_df, ["AccidentId", "VehicleId"]),
232-
"Places": (places_df, "AccidentId"),
229+
"Vehicles/Users": (users_df, ["AccidentId", "VehicleId"]),
230+
"Places": (places_df, ["AccidentId"], True),
233231
},
234-
"relations": [
235-
("Accidents", "Vehicles"),
236-
("Vehicles", "Users"),
237-
("Accidents", "Places", True),
238-
],
239232
}
240233

241234
# Load the target variable "Gravity"
@@ -416,16 +409,14 @@ def khiops_classifier_with_hyperparameters():
416409
# Create the dataset multitable specification for the train/test split
417410
# Each table is a tuple (dataframe, key_columns); additional tables are also named
418411
X_train = {
419-
"main_table": "Accidents",
420-
"tables": {
421-
"Accidents": (X_train_main, "AccidentId"),
412+
"main_table": (X_train_main, ["AccidentId"]),
413+
"additional_data_tables": {
422414
"Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]),
423415
},
424416
}
425417
X_test = {
426-
"main_table": "Accidents",
427-
"tables": {
428-
"Accidents": (X_test_main, "AccidentId"),
418+
"main_table": (X_test_main, ["AccidentId"]),
419+
"additional_data_tables": {
429420
"Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]),
430421
},
431422
}
@@ -578,9 +569,8 @@ def khiops_encoder_multitable_star():
578569

579570
# Build the multi-table dataset spec (drop the target column "Gravity")
580571
X = {
581-
"main_table": "Accidents",
582-
"tables": {
583-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
572+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
573+
"additional_data_tables": {
584574
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
585575
},
586576
}
@@ -622,18 +612,12 @@ def khiops_encoder_multitable_snowflake():
622612

623613
# Build the multi-table dataset spec (drop the target column "Gravity")
624614
X = {
625-
"main_table": "Accidents",
626-
"tables": {
627-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
615+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
616+
"additional_data_tables": {
628617
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
629-
"Users": (users_df, ["AccidentId", "VehicleId"]),
630-
"Places": (places_df, "AccidentId"),
618+
"Vehicles/Users": (users_df, ["AccidentId", "VehicleId"]),
619+
"Places": (places_df, ["AccidentId"], True),
631620
},
632-
"relations": [
633-
("Accidents", "Vehicles"),
634-
("Vehicles", "Users"),
635-
("Accidents", "Places", True),
636-
],
637621
}
638622

639623
# Load the target variable "Gravity"
@@ -739,14 +723,10 @@ def khiops_encoder_with_hyperparameters():
739723

740724
# Build the multi-table dataset spec (drop the target column "Gravity")
741725
X = {
742-
"main_table": "Accidents",
743-
"tables": {
744-
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
726+
"main_table": (accidents_df.drop("Gravity", axis=1), ["AccidentId"]),
727+
"additional_data_tables": {
745728
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
746729
},
747-
"relations": [
748-
("Accidents", "Vehicles"),
749-
],
750730
}
751731

752732
# Load the target variable "Gravity"

0 commit comments

Comments
 (0)