Source code for qf_lib.data_providers.helpers

#     Copyright 2016-present CERN – European Organization for Nuclear Research
#
#     Licensed under the Apache License, Version 2.0 (the "License");
#     you may not use this file except in compliance with the License.
#     You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#     Unless required by applicable law or agreed to in writing, software
#     distributed under the License is distributed on an "AS IS" BASIS,
#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#     See the License for the specific language governing permissions and
#     limitations under the License.
import warnings
from datetime import datetime
from typing import Union, Dict, Sequence, Any
import pandas as pd
from pandas import DatetimeIndex
from xarray import DataArray

from qf_lib.common.tickers.tickers import Ticker
from qf_lib.common.utils.dateutils.relative_delta import RelativeDelta
from qf_lib.common.utils.miscellaneous.to_list_conversion import convert_to_list
from qf_lib.containers.dataframe.cast_dataframe import cast_dataframe
from qf_lib.containers.dataframe.prices_dataframe import PricesDataFrame
from qf_lib.containers.dataframe.qf_dataframe import QFDataFrame
from qf_lib.containers.dimension_names import DATES, TICKERS, FIELDS
from qf_lib.containers.futures.future_tickers.future_ticker import FutureTicker
from qf_lib.containers.qf_data_array import QFDataArray
from qf_lib.containers.series.cast_series import cast_series
from qf_lib.containers.series.prices_series import PricesSeries
from qf_lib.containers.series.qf_series import QFSeries


def normalize_data_array(
        data_array, tickers, fields, got_single_date, got_single_ticker, got_single_field, use_prices_types=False) \
        -> Union[QFSeries, QFDataFrame, QFDataArray, PricesSeries, PricesDataFrame]:
    """
    Post-processes the result of some DataProviders so that it satisfies the format of a result expected
    from DataProviders. Expected format rules should cover the following:
    - proper return type (QFSeries/PricesSeries, QFDataFrame/PricesDataFrame, QFDataArray),
    - proper shape of the result (squeezed dimensions for which a single non-list value was provided, e.g. "OPEN"),
    - dimensions: TICKERS and FIELDS contain all required labels and the labels are in required order.

    Parameters
    ----------
    data_array
        data_array to be normalized
    tickers
        list of tickers requested by the caller; it is compared against the list of ticker labels
        of the data_array, so it is expected to be a list
    fields
        list of fields requested by the caller; it is compared against the list of field labels
        of the data_array, so it is expected to be a list
    got_single_date
        True if a single (scalar value) date was requested (start_date==end_date); False otherwise
    got_single_ticker
        True if a single (scalar value) ticker was requested (e.g. "MSFT US Equity"); False otherwise
    got_single_field
        True if a single (scalar value) field was requested (e.g. "OPEN"); False otherwise
    use_prices_types
        if True then proper return types are: PricesSeries, PricesDataFrame or QFDataArray;
        otherwise return types are: QFSeries, QFDataFrame or QFDataArray

    Returns
    -------
    QFSeries, QFDataFrame, QFDataArray, PricesSeries, PricesDataFrame
    """
    # Reindex only if the labels differ from the requested ones; afterwards the TICKERS and FIELDS
    # dimensions hold exactly the requested labels, in the requested order.
    if data_array.tickers.values.tolist() != tickers:
        data_array = data_array.reindex(tickers=tickers)
    if data_array.fields.values.tolist() != fields:
        data_array = data_array.reindex(fields=fields)

    # Remove the dates for which every value is missing.
    data_array = data_array.dropna(DATES, how='all')

    return squeeze_data_array_and_cast_to_proper_type(
        data_array, got_single_date, got_single_ticker, got_single_field, use_prices_types)


def squeeze_data_array_and_cast_to_proper_type(original_data_array: QFDataArray, got_single_date: bool,
                                               got_single_ticker: bool, got_single_field: bool, use_prices_types: bool):
    """
    Squeezes the dimensions for which a single (scalar) value was requested and casts the outcome
    to the corresponding container type.

    Parameters
    ----------
    original_data_array
        data array to be squeezed; a plain xarray DataArray is accepted as well, but it triggers a warning
        and is converted to a QFDataArray first
    got_single_date
        True if the DATES dimension should be squeezed
    got_single_ticker
        True if the TICKERS dimension should be squeezed
    got_single_field
        True if the FIELDS dimension should be squeezed
    use_prices_types
        if True, PricesSeries / PricesDataFrame are used instead of QFSeries / QFDataFrame

    Returns
    -------
    The squeezed container. If no dimension is squeezed, the QFDataArray is passed through
    cast_data_array_to_proper_type; if the array is empty, a pandas container (or a scalar) is built by hand.
    """
    if isinstance(original_data_array, DataArray) and not isinstance(original_data_array, QFDataArray):
        # The check above is about QFDataArray, so that is the type named in the message.
        warnings.warn("data_array to be normalized should be a QFDataArray instance. "
                      "Transforming data_array to QFDataArray. Please check types in the future.")
        original_data_array = QFDataArray.from_xr_data_array(original_data_array)

    dimensions_to_squeeze = []
    if got_single_date:
        dimensions_to_squeeze.append(DATES)
    if got_single_ticker:
        dimensions_to_squeeze.append(TICKERS)
    if got_single_field:
        dimensions_to_squeeze.append(FIELDS)

    container = original_data_array
    if dimensions_to_squeeze:
        if original_data_array.size == 0:  # empty
            # No data: instead of squeezing the array, build an empty pandas container by hand,
            # starting from a tickers x fields frame and dropping the single-valued axes.
            container = QFDataFrame(index=original_data_array[TICKERS].values,
                                    columns=original_data_array[FIELDS].values)
            container.index.name = TICKERS
            container.columns.name = FIELDS

            if use_prices_types:
                container = PricesDataFrame(container)
            if got_single_field:
                container = container.squeeze(axis=1)
                container.name = original_data_array[FIELDS].values[0]
            if got_single_ticker:
                container = container.squeeze(axis=0)
            if not got_single_date:
                # The DATES dimension has to be kept, so the result is indexed by an empty dates index.
                dates = DatetimeIndex([], name=DATES)
                if got_single_ticker and got_single_field:
                    container = QFSeries(index=dates)
                if use_prices_types:
                    container = PricesSeries(container)
                if not got_single_ticker or not got_single_field:
                    # Turn the remaining labels into columns of a frame with the empty dates index.
                    container = container.to_frame().T.reindex(dates)
                    container.index.name = DATES
        else:
            container = original_data_array.squeeze(dimensions_to_squeeze)

    # When all three dimensions are squeezed there is nothing left to name.
    if len(dimensions_to_squeeze) < 3:
        if got_single_ticker:
            ticker = original_data_array.tickers[0].item()
            container.name = ticker.as_string()
        elif got_single_field:
            container.name = original_data_array.fields[0].item()

    if isinstance(container, QFDataArray):
        container = cast_data_array_to_proper_type(container, use_prices_types)

    return container


def cast_data_array_to_proper_type(result: QFDataArray, use_prices_types=False):
    """
    Converts a data array into the container matching its number of dimensions.

    Parameters
    ----------
    result
        data array to be converted
    use_prices_types
        if True, PricesSeries / PricesDataFrame are used; otherwise QFSeries / QFDataFrame

    Returns
    -------
    - 0 dimensions: the single element, obtained with ``item()``,
    - 1 dimension: a series carrying the name of the data array,
    - 2 dimensions: a data frame,
    - more dimensions: the data array itself, unchanged.
    """
    series_type, data_frame_type = (
        (PricesSeries, PricesDataFrame) if use_prices_types else (QFSeries, QFDataFrame)
    )

    rank = len(result.shape)
    if rank == 0:
        return result.item()
    if rank == 1:
        as_series = cast_series(result.to_pandas(), series_type)
        as_series.name = result.name
        return as_series
    if rank == 2:
        return cast_dataframe(result.to_pandas(), data_frame_type)
    return result


def cast_dataframe_to_proper_type(result):
    """
    Casts a pandas-like container to the matching QF type, based on the number of its axes.

    An object with one axis is cast to QFSeries, an object with two axes is cast to QFDataFrame,
    and any other object is returned unchanged.
    """
    axes_count = len(result.axes)
    if axes_count == 1:
        return cast_series(result, QFSeries)
    if axes_count == 2:
        return cast_dataframe(result, QFDataFrame)
    return result


def tickers_dict_to_data_array(tickers_data_dict: Dict[Ticker, QFDataFrame],
                               requested_tickers: Union[Ticker, Sequence[Ticker]],
                               requested_fields: Union[Any, Sequence[Any]]) -> QFDataArray:
    """
    Converts a dictionary mapping tickers to DataFrames into a QFDataArray,
    keeping only the tickers and fields that were requested.

    Note: the index name of every DataFrame in tickers_data_dict is set to DATES in place.

    Parameters
    ----------
    tickers_data_dict:  Dict[Ticker, QFDataFrame]
        Ticker -> QFDataFrame[dates, fields]
    requested_tickers: Sequence[Ticker]
        Filter the data dict based on a list of tickers (a single ticker is accepted as well)
    requested_fields
        Filter the data dict based on a list of fields (a single field is accepted as well)

    Returns
    -------
    QFDataArray
        array with the requested tickers and fields; it has no dates if the dictionary is empty
        or all its DataFrames are empty
    """
    requested_tickers, _ = convert_to_list(requested_tickers, Ticker)
    # a string is a Sequence too, but it denotes a single field, so it needs to be wrapped as well
    if not isinstance(requested_fields, Sequence) or isinstance(requested_fields, str):
        requested_fields, _ = convert_to_list(requested_fields, type(requested_fields))

    tickers = []
    data_arrays = []
    for ticker, df in tickers_data_dict.items():
        df.index.name = DATES
        if df.empty:  # if there is no data for a given ticker, skip it (proper column will be added afterwards anyway)
            continue

        # dates x fields array, named after the ticker
        data_array = df.to_xarray().to_array(dim=FIELDS, name=ticker).transpose(DATES, FIELDS)

        tickers.append(ticker)
        data_arrays.append(data_array)

    # return an empty QFDataArray if there is no data to be converted (this also covers an empty dictionary)
    if not data_arrays:
        return QFDataArray.create(dates=[], tickers=requested_tickers, fields=requested_fields)

    tickers_index = pd.Index(tickers, name=TICKERS)
    result = QFDataArray.concat(data_arrays, dim=tickers_index)
    result = result.reindex(tickers=requested_tickers, fields=requested_fields)

    # the DataArray gets a name after the first ticker in the tickers_data_dict.keys() which is incorrect;
    # it should have no name
    result.name = None

    return result


def get_fields_from_tickers_data_dict(tickers_data_dict):
    """
    Collects the unique fields (column labels) of all DataFrames in the dictionary.

    Parameters
    ----------
    tickers_data_dict
        dictionary whose values are DataFrames; their column labels are the fields

    Returns
    -------
    list
        unique fields, in the order in which they are first encountered while iterating over the dictionary
    """
    # A dict is used as an ordered set: unlike list(set(...)), the resulting order is the same on every
    # run and does not depend on hashing.
    ordered_fields = {}
    for dates_fields_df in tickers_data_dict.values():
        ordered_fields.update(dict.fromkeys(dates_fields_df.columns.values))

    return list(ordered_fields)


def chain_tickers_within_range(future_ticker: FutureTicker, exp_dates: QFDataFrame, start_date: datetime,
                               end_date: datetime):
    """
    Returns only those tickers belonging to the chain of a given FutureTicker whose expiration dates
    fall within the given time frame.

    As it is possible to select the contracts to be traded for a given future ticker (e.g. for Bloomberg
    future tickers we could specify only to trade "M" contracts), the end date is computed as the original end date
    + 1 year x contract number to trade.
    E.g. if we specify that we only want to trade "M" contracts and we always want to trade the front M contract,
    we add 1 year x 1. If instead of the front M, we would like to trade the second next M contract,
    we add 2 years to the end date etc.

    Parameters
    ----------
    future_ticker
        future ticker, whose N attribute is the number of years added to the end date
    exp_dates
        expiration dates of the contracts; the index labels of the rows which pass the filter are returned
    start_date
        lower bound (inclusive) for the expiration dates
    end_date
        upper bound (inclusive) for the expiration dates, before it is extended by future_ticker.N years

    Returns
    -------
    list
        index labels of exp_dates for the rows whose dates all lie within the (extended) range
    """
    extended_end_date = end_date + RelativeDelta(years=future_ticker.N)

    # Masking replaces the values outside of the range with missing values; dropna then removes these rows.
    exp_dates = exp_dates[exp_dates >= start_date].dropna()
    exp_dates = exp_dates[exp_dates <= extended_end_date].dropna()
    return exp_dates.index.tolist()