
#     Copyright 2016-present CERN – European Organization for Nuclear Research
#
#     Licensed under the Apache License, Version 2.0 (the "License");
#     you may not use this file except in compliance with the License.
#     You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#     Unless required by applicable law or agreed to in writing, software
#     distributed under the License is distributed on an "AS IS" BASIS,
#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#     See the License for the specific language governing permissions and
#     limitations under the License.

from qf_lib.common.utils.dateutils.get_values_common_dates import get_values_for_common_dates
from qf_lib.common.utils.returns.beta_and_alpha import beta_and_alpha
from qf_lib.containers.dataframe.simple_returns_dataframe import SimpleReturnsDataFrame
from qf_lib.containers.series.cast_series import cast_series
from qf_lib.containers.series.qf_series import QFSeries


class DataCleaner:
    """
    Cleans data which is partially incomplete, e.g. has gaps.

    Parameters
    ----------
    dataframe: SimpleReturnsDataFrame
        DataFrame of simple returns. If one column has more missing values than the threshold,
        it is removed from the result.
    threshold: float
        top limit of missing data. If the amount of missing data in a series exceeds this limit,
        the series will be removed. It is a relative value (e.g. 0.02, which corresponds to 2%
        of the data from the series).
    """

    def __init__(self, dataframe: SimpleReturnsDataFrame, threshold: float = 0.05):
        assert isinstance(dataframe, SimpleReturnsDataFrame)
        self.dataframe = dataframe
        self.threshold = threshold

        self.incorrect_columns = []    # Columns which contain only NaN values.
        self.start_late_columns = {}   # Columns which start late, mapped to their first valid date.
        self.columns_with_holes = []   # Columns with one or more NaN values after their first valid date.
    def proxy_using_value(self, proxy_value: float) -> SimpleReturnsDataFrame:
        """
        Removes columns from the DataFrame which have too many missing values. Then, the missing data
        in the remaining columns is completed using the given proxy_value.

        Parameters
        ----------
        proxy_value: float
            value with which all the missing data should be filled

        Returns
        -------
        SimpleReturnsDataFrame
            completed dataframe without missing data
        """
        result_dataframe = self.dataframe.copy(deep=True)
        empty_values_idx = self.dataframe.isnull()

        self._drop_underfilled_columns(result_dataframe, empty_values_idx)
        result_dataframe[empty_values_idx] = proxy_value

        return result_dataframe
    def proxy_using_regression(self, benchmark_tms: QFSeries, columns_type: type) -> SimpleReturnsDataFrame:
        """
        Removes columns from the DataFrame which have too many missing values. Then, the missing data
        in the remaining columns is completed using a regression against the benchmark.

        Parameters
        ----------
        benchmark_tms: QFSeries
            benchmark used indirectly to proxy the missing data in the DataFrame
        columns_type: type
            type of each column (e.g. PricesSeries, LogReturnsSeries)

        Returns
        -------
        SimpleReturnsDataFrame
            completed dataframe. It can still contain missing data, because it is not always possible to
            complete all values using regression (e.g. when there is no benchmark value for a date that is
            missing in the original series).
        """
        result_dataframe = self.dataframe.copy(deep=True)
        empty_values_idx = self.dataframe.isnull()

        self._drop_underfilled_columns(result_dataframe, empty_values_idx)
        self._use_regression_to_fill_missing_data(benchmark_tms, columns_type, result_dataframe, empty_values_idx)

        return result_dataframe
    def _drop_underfilled_columns(self, result_dataframe, empty_values_idx):
        """ Drops columns whose fraction of missing values exceeds the threshold and records why each was dropped. """
        columns_to_delete = []

        for column_name, is_empty_values in empty_values_idx.items():
            empty_values_ratio = sum(is_empty_values) / len(is_empty_values)  # #empty_values / #all_values
            if empty_values_ratio > self.threshold:
                columns_to_delete.append(column_name)

                # Categorize the dropped column: all-NaN, NaN gaps after the first valid value, or a late start.
                first_valid_index = self.dataframe[column_name].first_valid_index()
                if empty_values_ratio == 1:
                    self.incorrect_columns.append(column_name)
                elif self.dataframe[column_name][first_valid_index:].isnull().any():
                    self.columns_with_holes.append(column_name)
                elif first_valid_index != self.dataframe[column_name].index[0]:
                    self.start_late_columns[column_name] = first_valid_index
                else:
                    assert False, "Unknown reason for dropping column " + column_name

        empty_values_idx.drop(columns_to_delete, axis=1, inplace=True)
        result_dataframe.drop(columns_to_delete, axis=1, inplace=True)

    def _use_regression_to_fill_missing_data(self, benchmark_tms, columns_type, result_dataframe, empty_values_idx):
        num_of_columns = result_dataframe.shape[1]

        for i in range(num_of_columns):
            column = result_dataframe.iloc[:, i]
            nans_in_column_idx = empty_values_idx.iloc[:, i]

            # Estimate the regression coefficients of the column against the benchmark using the non-missing values.
            beta, alpha = self._get_beta_and_alpha(benchmark_tms, column, columns_type, nans_in_column_idx)

            # Take the benchmark values for the dates on which the column has missing data.
            benchmark_common_tms, nans_common_idx = get_values_for_common_dates(benchmark_tms, nans_in_column_idx)
            benchmark_values_for_missing_dates = benchmark_common_tms[nans_common_idx]

            # Proxy the missing values with the fitted regression line.
            missing_values = beta * benchmark_values_for_missing_dates + alpha
            column[nans_in_column_idx] = missing_values

        return result_dataframe

    def _get_beta_and_alpha(self, benchmark_tms, column, columns_type, nans_in_column_idx):
        column_without_nans = column[~nans_in_column_idx]
        column_without_nans = cast_series(column_without_nans, columns_type)
        beta, alpha = beta_and_alpha(column_without_nans, benchmark_tms)

        return beta, alpha
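
# ---------------------------------------------------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It shows how DataCleaner might be driven end to end,
# assuming that SimpleReturnsDataFrame and SimpleReturnsSeries accept pandas-style constructor arguments
# (data, index) and that SimpleReturnsSeries is importable from qf_lib.containers.series.simple_returns_series;
# adjust the constructors and import path to the actual API if they differ.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    from qf_lib.containers.series.simple_returns_series import SimpleReturnsSeries

    dates = pd.date_range("2020-01-01", periods=5, freq="D")

    # Column "B" has one missing value (20% of its data), so with threshold=0.3 it is kept and proxied;
    # with the default threshold=0.05 it would be dropped instead.
    returns = SimpleReturnsDataFrame(
        data={"A": [0.01, -0.02, 0.005, 0.01, -0.01],
              "B": [0.02, np.nan, 0.01, -0.01, 0.0]},
        index=dates)
    benchmark = SimpleReturnsSeries(data=[0.015, -0.01, 0.0, 0.005, -0.005], index=dates)

    cleaner = DataCleaner(returns, threshold=0.3)

    # Fill the gaps with a constant value (here: 0.0).
    filled_with_constant = cleaner.proxy_using_value(proxy_value=0.0)

    # Fill the gaps using a regression of each column against the benchmark.
    filled_with_regression = cleaner.proxy_using_regression(benchmark, columns_type=SimpleReturnsSeries)

    print(filled_with_constant)
    print(filled_with_regression)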