imbalanced datasets loader

f0f6fef0 · Caron Olivier · 7171d424 · f0f6fef0
Commit f0f6fef0 authored 1 year ago by Caron Olivier
--- a/datasets/keel_imbalanced_dataset_preprocessing.py
+++ b/datasets/keel_imbalanced_dataset_preprocessing.py
+# -*- coding: utf-8 -*-
+# copyright: mary-morstan developers (see Authors.txt file), GPL v3 License (see LICENSE file)
+
+import pandas as pd
+from marymorstan.preprocessing.dataset_preprocessing import DatasetPreprocessing, UnknownDataset
+
+
+class MyDataSetPreprocessing(DatasetPreprocessing):
+    """This class allow to load an imbalanced dataset provided by KEEL library,
+        see : https://sci2s.ugr.es/keel/imbalanced.php"""
+    def load_dataset(self, dataset_location):
+        """ specific to the dataset origin, retrieve and return the dataset as a panda Dataframe"""
+        try:
+            v_names, lines_to_avoid = analyse_preamble(dataset_location)
+            if lines_to_avoid == -1:
+                raise f"'@data' string not found in CSV file{dataset_location}"
+            dataset = pd.read_csv(dataset_location, names=v_names, skiprows=lines_to_avoid)
+            return dataset
+        except Exception:
+            raise UnknownDataset(dataset_location)
+
+
+def analyse_preamble(dataset_location):
+    """ return the line number where "@data" string is found inside the dataset_location file,
+        return -1 if not found """
+    f = open(dataset_location, 'r')
+    names = []
+    line_num = 0
+    for line in f.readlines():
+        line_num += 1
+        list_element = line.split()
+        if list_element[0] == "@attribute":
+            names.append(list_element[1])
+        if list_element[0] == "@data":
+            return names, line_num
+    return [], -1
+
+
+
+
+