Source code for reperiods.find_RP.kmedoids

import numpy as np
import pandas as pd

from ..representative_periods import RepresentativePeriods

def kmedoids_method(
    data: pd.DataFrame, N_RP: int, RP_length: int
) -> list[RepresentativePeriods]:
    """Generate representative periods (RPs) using the k-medoids clustering method.

    The data is cut into consecutive, non-overlapping windows of ``RP_length``
    rows (the candidate periods). Each candidate is flattened column by column
    into one vector, the vectors are clustered with k-medoids using the
    Euclidean metric, and the medoid of every cluster is returned as a
    representative period.

    Weights are calculated proportionally to the number of candidates in each
    cluster, so the returned weights sum to 1.

    Args:
        data (DataFrame): A DataFrame containing the data where RP will be
            found. Rows left over after the last complete window
            (``len(data) % RP_length``) are ignored.
        N_RP (int): The number of representative periods to generate.
        RP_length (int): The length (number of rows) of each representative
            period.

    Returns:
        list: A list of RepresentativePeriods objects, each representing an RP
        with its weight, in the order of the estimator's ``medoid_indices_``.

    Raises:
        ImportError: If the scikit-learn-extra package is not installed.
            Please install it by running: pip install reperiods[kmedoids].
        ValueError: If ``RP_length`` is smaller than 1, or if ``N_RP`` is not
            between 1 and the number of candidate periods.
    """
    try:
        from sklearn_extra.cluster import KMedoids
    except ImportError as err:
        # Chain the original error so the root cause stays in the traceback.
        raise ImportError(
            "The kmedoids_method requires the scikit-learn-extra package, which is not installed. "
            "Please install it by running: pip install reperiods[kmedoids]"
        ) from err

    if RP_length < 1:
        raise ValueError(
            f"RP_length must be a positive integer, got {RP_length}."
        )

    # Get RP candidates (not normalized); incomplete trailing window is dropped.
    n_candidates = data.shape[0] // RP_length
    if not 1 <= N_RP <= n_candidates:
        raise ValueError(
            f"N_RP must be between 1 and the number of candidate periods "
            f"({n_candidates}), got {N_RP}."
        )

    candidates = [
        data.iloc[start : start + RP_length]
        for start in range(0, n_candidates * RP_length, RP_length)
    ]

    # One row per candidate. order="F" flattens column-major, i.e. the values
    # of the first column come first, then the second column, and so on.
    flat_length = RP_length * data.shape[1]
    features = np.array(
        [
            candidate.to_numpy().reshape(flat_length, order="F")
            for candidate in candidates
        ]
    )

    # Apply k-medoids clustering
    kmedoids = KMedoids(metric="euclidean", n_clusters=N_RP)
    kmedoids.fit(features)

    # Assign every candidate to a cluster once, then count members per cluster.
    # minlength keeps one entry per cluster even if a cluster ends up empty.
    labels = kmedoids.predict(features)
    weights = np.bincount(labels, minlength=N_RP) / n_candidates

    # Cluster k is represented by the candidate at medoid_indices_[k].
    return [
        RepresentativePeriods(data=candidates[medoid_id], weight=weights[k])
        for k, medoid_id in enumerate(kmedoids.medoid_indices_)
    ]