# Source code for resdk.tables.ml_ready

""".. Ignore pydocstyle D400.

=============================
Machine learning ready tables
=============================

.. autoclass:: MLTables
    :members:

    .. automethod:: __init__

"""

import warnings
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd


class MLTables:
    """Machine-learning ready tables.

    Looks up the reference space Data named ``name`` and, among its
    children, the ML-ready expressions Data that belongs to the given
    collection. The expressions are exposed as a ``pandas.DataFrame``
    through the :attr:`exp` property.
    """

    # Fields requested from the server for the ML-ready expressions Data.
    DATA_FIELDS = [
        "id",
        "slug",
        "name",
        "modified",
        "output",
    ]

    def __init__(self, collection, name):
        """Initialize class.

        :param collection: Collection to use
        :param name: Name of the reference space Data to look up
        """
        self.resolwe = collection.resolwe
        self.collection = collection
        self.name = name

    def _get_ref_space(self):
        """Get reference space Data with specified name.

        :return: The single reference space Data named ``self.name``
        :raises ValueError: If no reference space, or more than one, has
            that name
        """
        ref_spaces = self.resolwe.data.filter(
            type="data:ml:space",
            status="OK",
            fields=["id", "name"],
            collection__slug="reference-spaces",
            name=self.name,
        )

        # Evaluate the count once so both checks see the same value.
        n_spaces = ref_spaces.count()
        if n_spaces == 0:
            raise ValueError(f"No Reference space with name {self.name}.")
        if n_spaces > 1:
            raise ValueError(f"Multiple Reference spaces with name {self.name}.")

        return ref_spaces[0]

    def _get_datum(self):
        """Get ML ready expressions Data object.

        :return: The first ML-ready expressions Data in the collection
            that is a child of the reference space
        :raises ValueError: If the collection has no such Data (or the
            reference space lookup fails)
        """
        ref_space = self._get_ref_space()

        # Get ID's of ref_space children
        children_ids = [
            item["id"]
            for item in self.resolwe.api.data(ref_space.id).children.get(fields="id")
        ]

        data = self.collection.data.filter(
            type="data:ml:table:expressions",
            id__in=children_ids,
            status="OK",
            fields=self.DATA_FIELDS,
        )

        # Evaluate the count once so both checks see the same value.
        n_data = data.count()
        if n_data == 0:
            raise ValueError(f"No ML-ready data in collection {self.collection.name}.")
        if n_data > 1:
            # NOTE(review): "latest" relies on the query's default ordering,
            # which is not set explicitly here - confirm.
            warnings.warn(
                f"Multiple ML-ready data in collection {self.collection.name}. "
                "Using the latest one."
            )

        return data[0]

    @property
    def exp(self):
        """
        Get ML ready expressions as pandas.DataFrame.

        These expressions are normalized and batch effect corrected -
        thus ready to be taken into ML procedures.

        The table is downloaded on every access. An HTTP error status
        is raised by ``response.raise_for_status()``.
        """
        datum = self._get_datum()

        url = urljoin(
            self.resolwe.url, f"data/{datum.id}/{datum.output['exp']['file']}"
        )
        response = self.resolwe.session.get(url)
        response.raise_for_status()

        # Parse the tab-separated payload; the first column is the index.
        with BytesIO(response.content) as buffer:
            df = pd.read_csv(buffer, sep="\t", index_col=0)

        return df