Skip to content
Snippets Groups Projects
Commit f0f6fef0 authored by Caron Olivier's avatar Caron Olivier
Browse files

imbalanced datasets loader

parent 7171d424
No related branches found
No related tags found
No related merge requests found
# -*- coding: utf-8 -*-
# copyright: mary-morstan developers (see Authors.txt file), GPL v3 License (see LICENSE file)
import pandas as pd
from marymorstan.preprocessing.dataset_preprocessing import DatasetPreprocessing, UnknownDataset
class MyDataSetPreprocessing(DatasetPreprocessing):
    """Load an imbalanced dataset provided by the KEEL library.

    See: https://sci2s.ugr.es/keel/imbalanced.php
    """

    def load_dataset(self, dataset_location):
        """Retrieve the dataset at *dataset_location* as a pandas DataFrame.

        The preamble of the file is scanned with ``analyse_preamble`` to get
        the column names (from the ``@attribute`` lines) and the number of
        leading lines to skip (everything up to and including ``@data``);
        the remaining lines are parsed as CSV.

        Args:
            dataset_location: path of the KEEL dataset file to load.

        Returns:
            The DataFrame produced by ``pandas.read_csv``.

        Raises:
            UnknownDataset: if the file cannot be read, has no ``@data``
                marker, or cannot be parsed. The underlying error is attached
                as ``__cause__``.
        """
        try:
            v_names, lines_to_avoid = analyse_preamble(dataset_location)
            if lines_to_avoid == -1:
                # A plain string cannot be raised; use a real exception so
                # the reason is preserved as the cause of UnknownDataset.
                raise ValueError(
                    f"'@data' string not found in CSV file {dataset_location}"
                )
            return pd.read_csv(
                dataset_location, names=v_names, skiprows=lines_to_avoid
            )
        except Exception as err:
            # Every failure is reported to callers as UnknownDataset, keeping
            # the original error chained for diagnosis.
            raise UnknownDataset(dataset_location) from err
def analyse_preamble(dataset_location):
    """Scan the preamble of a dataset file for column names and the data start.

    Lines are read one by one until a line whose first whitespace-separated
    token is ``@data``. For every earlier line whose first token is
    ``@attribute``, the second token is collected as a column name. Blank
    lines are ignored.

    Args:
        dataset_location: path of the dataset file to inspect.

    Returns:
        A tuple ``(names, line_number)`` where ``names`` is the list of
        attribute names found before ``@data`` and ``line_number`` is the
        1-based number of the ``@data`` line (i.e. the count of lines to skip
        before the data rows). Returns ``([], -1)`` if no ``@data`` line is
        found.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if an ``@attribute`` line carries no name.
    """
    names = []
    # The context manager closes the file on every exit path, including the
    # early return below.
    with open(dataset_location, 'r') as dataset_file:
        # Iterating the file lazily stops reading as soon as '@data' is met.
        for line_num, line in enumerate(dataset_file, start=1):
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to inspect, but it still counts as a
                # line to skip.
                continue
            keyword = tokens[0]
            if keyword == "@attribute":
                names.append(tokens[1])
            elif keyword == "@data":
                return names, line_num
    return [], -1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment