# Source code for mars.learn.utils.multiclass

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections.abc import Sequence

import numpy as np
try:
    from scipy.sparse.base import spmatrix
except ImportError:  # pragma: no cover
    spmatrix = None

from ... import opcodes as OperandDef
from ... import tensor as mt
from ...core import ENTITY_TYPE, recursive_tile
from ...serialization.serializables import KeyField, BoolField, TupleField, DataTypeField, AnyField, ListField
from ...tensor.core import TensorOrder
from ..operands import LearnOperand, LearnOperandMixin, OutputType
from ..utils import assert_all_finite


class IsMultilabel(LearnOperand, LearnOperandMixin):
    """
    Operand deciding whether a target ``y`` is in multilabel-indicator format.

    The tileable-level operand carries ``y`` and, when available, a tensor
    holding the unique values of ``y``.  The chunk-level operand created in
    :meth:`tile` carries only the unique-values chunk plus a flag recording
    whether ``y`` is sparse.
    """
    _op_type_ = OperandDef.IS_MULTILABEL

    _y = AnyField('y')
    _unique_y = KeyField('unique_y')
    # for chunk
    _is_y_sparse = BoolField('is_y_sparse')

    def __init__(self, y=None, unique_y=None, is_y_sparse=None, **kw):
        super().__init__(_y=y, _unique_y=unique_y,
                         _is_y_sparse=is_y_sparse, **kw)
        self.output_types = [OutputType.tensor]

    @property
    def y(self):
        """The target passed by the caller (entity or plain object)."""
        return self._y

    @property
    def unique_y(self):
        """Unique values of ``y`` (tensor or chunk), or ``None``."""
        return self._unique_y

    @property
    def is_y_sparse(self):
        """Chunk-level flag: whether ``y`` reported itself as sparse."""
        return self._is_y_sparse

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        bound = self._inputs
        # ``y`` is always the first input when it is an entity, and the
        # unique values, when present, are always appended last.
        if isinstance(self._y, ENTITY_TYPE):
            self._y = bound[0]
        if self._unique_y is not None:
            self._unique_y = bound[-1]

    def __call__(self, y, y_unique=None):
        inputs = []
        if isinstance(y, ENTITY_TYPE):
            inputs.append(y)
        if y_unique is not None:
            inputs.append(y_unique)
        return self.new_tileable(inputs, shape=(), dtype=np.dtype(bool),
                                 order=TensorOrder.C_ORDER)

    @classmethod
    def tile(cls, op):
        y = op.y
        out = op.outputs[0]

        # Only a 2-d target with more than one column can be multilabel;
        # everything else is answered with a constant ``False`` tensor.
        may_be_multilabel = \
            hasattr(y, 'shape') and y.ndim == 2 and y.shape[1] > 1
        if not may_be_multilabel:
            false_tensor = yield from recursive_tile(mt.array(False))
            return [false_tensor]

        unique_y = op.unique_y
        assert len(unique_y.chunks) == 1
        unique_chunk = unique_y.chunks[0]

        chunk_op = IsMultilabel(unique_y=unique_chunk,
                                is_y_sparse=y.issparse())
        # NOTE(review): index is (0,) although the chunk shape is () --
        # kept exactly as in the original; confirm whether () was intended.
        out_chunk = chunk_op.new_chunk([unique_chunk], dtype=out.dtype,
                                       order=out.order, index=(0,),
                                       shape=())

        tileable_op = op.copy()
        params = out.params
        params['nsplits'] = ()
        params['chunks'] = [out_chunk]
        return tileable_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def execute(cls, ctx, op):
        unique_y = ctx[op.unique_y.key]

        if op.is_y_sparse:
            # sparse: at most one distinct value is allowed
            few_enough_values = unique_y.size in (0, 1)
        else:
            # dense: at most two distinct labels are allowed
            few_enough_values = len(unique_y) < 3

        # bool, int, uint -- or floats that all hold integral values
        ctx[op.outputs[0].key] = few_enough_values and (
            unique_y.dtype.kind in 'biu' or _is_integral_float(unique_y))


def _is_integral_float(y):
    return y.dtype.kind == 'f' and np.all(y.astype(int) == y)


def is_multilabel(y):
    """
    Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.  Mars entities, objects exposing ``__array__`` and
        plain sequences are accepted as well.

    Returns
    -------
    out : bool
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.
        The result is a lazy 0-d boolean tensor; call ``.execute()`` on it
        to obtain the value.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1]).execute()
    False
    >>> is_multilabel([[1], [0, 2], []]).execute()
    False
    >>> is_multilabel(mt.array([[1, 0], [0, 0]])).execute()
    True
    >>> is_multilabel(mt.array([[1], [0], [0]])).execute()
    False
    >>> is_multilabel(mt.array([[1, 0, 0]])).execute()
    True
    """
    if not isinstance(y, ENTITY_TYPE):
        if hasattr(y, '__array__') or isinstance(y, Sequence):
            try:
                y = np.asarray(y)
            except ValueError:
                # NumPy >= 1.24 refuses to build a ragged array implicitly
                # (e.g. ``[[1], [0, 2], []]``); request an object array
                # explicitly, which is what older NumPy produced.
                y = np.asarray(y, dtype=object)
        if hasattr(y, 'shape'):
            yt = y = mt.asarray(y)
        else:
            # not array-like: the operand is created without tensor input
            yt = None
    else:
        yt = y = mt.tensor(y)

    # Unique values are only computed for non-object dtypes.
    if hasattr(y, 'dtype') and y.dtype != np.object_:
        unique_y = mt.unique(y, aggregate_size=1)
    else:
        unique_y = None

    op = IsMultilabel(y=y, unique_y=unique_y)
    return op(yt, unique_y)
class TypeOfTarget(LearnOperand, LearnOperandMixin):
    """
    Operand computing the target type string of ``y``.

    The tileable-level operand only carries ``y``.  :meth:`tile` builds one
    chunk-level operand whose inputs are the small helper results needed to
    classify the target (multilabel flag, first value, float checks, unique
    values) together with the static shape and dtype of ``y``.
    """
    __slots__ = ('_unique_y_chunk', '_check_all_finite_chunk')
    _op_type_ = OperandDef.TYPE_OF_TARGET

    _y = AnyField('y')
    # for chunks
    _is_multilabel = KeyField('is_multilabel')
    _first_value = KeyField('first_value')
    _check_float = KeyField('check_float')
    _assert_all_finite = KeyField('assert_all_finite')
    _unique_y = KeyField('unique_y')
    _y_shape = TupleField('y_shape')
    _y_dtype = DataTypeField('y_dtype')
    _checked_targets = ListField('checked_targets')

    def __init__(self, y=None, is_multilabel=None, first_value=None,
                 check_float=None, assert_all_finite=None,
                 unique_y=None, y_shape=None, y_dtype=None,
                 checked_targets=None, **kw):
        super().__init__(_y=y, _is_multilabel=is_multilabel,
                         _first_value=first_value, _check_float=check_float,
                         _assert_all_finite=assert_all_finite,
                         _unique_y=unique_y, _y_shape=y_shape,
                         _y_dtype=y_dtype, _checked_targets=checked_targets,
                         **kw)
        self.output_types = [OutputType.tensor]

    @property
    def y(self):
        return self._y

    @property
    def is_multilabel(self):
        return self._is_multilabel

    @property
    def first_value(self):
        return self._first_value

    @property
    def check_float(self):
        return self._check_float

    @property
    def assert_all_finite(self):
        return self._assert_all_finite

    @property
    def unique_y(self):
        return self._unique_y

    @property
    def y_shape(self):
        return self._y_shape

    @property
    def y_dtype(self):
        return self._y_dtype

    @property
    def checked_targets(self):
        return self._checked_targets

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        # Inputs are consumed in the same order in which ``tile`` appends
        # them to ``chunk_inputs``; fields that are not entities are skipped.
        inputs_iter = iter(self._inputs)
        for attr in ['_y', '_is_multilabel', '_first_value',
                     '_check_float', '_assert_all_finite',
                     '_unique_y']:
            v = getattr(self, attr)
            if isinstance(v, ENTITY_TYPE):
                setattr(self, attr, next(inputs_iter))

    def __call__(self, y):
        inputs = [y] if isinstance(y, ENTITY_TYPE) else []
        return self.new_tileable(inputs, shape=(), order=TensorOrder.C_ORDER,
                                 dtype=np.dtype(object))

    @classmethod
    def tile(cls, op):
        out = op.outputs[0]
        y = op.y

        chunk_inputs = []
        is_multilabel_chunk = \
            (yield from recursive_tile(is_multilabel(y))).chunks[0]
        chunk_inputs.append(is_multilabel_chunk)

        if not isinstance(y, ENTITY_TYPE):
            if hasattr(y, '__array__'):
                y = np.asarray(y)
            y = mt.asarray(y)
        if np.isnan(y.size):  # pragma: no cover
            # NOTE(review): bare ``yield`` presumably hands control back to
            # the tiling framework until the shape is known -- confirm.
            yield

        chunk_op = TypeOfTarget(is_multilabel=is_multilabel_chunk,
                                y_shape=y.shape, y_dtype=y.dtype)

        if y.ndim <= 2 and y.size > 0 and y.dtype == object:
            first_value_chunk = \
                (yield from recursive_tile(y[(0,) * y.ndim])).chunks[0]
            chunk_inputs.append(first_value_chunk)
            chunk_op._first_value = first_value_chunk

        if y.dtype.kind == 'f':
            check_float_chunk = \
                (yield from recursive_tile(mt.any(y != y.astype(int)))).chunks[0]
            chunk_inputs.append(check_float_chunk)
            chunk_op._check_float = check_float_chunk

            # ``_execute`` only reads the finite check for float targets.
            assert_all_finite_chunk = \
                (yield from recursive_tile(assert_all_finite(y))).chunks[0]
            chunk_inputs.append(assert_all_finite_chunk)
            chunk_op._assert_all_finite = assert_all_finite_chunk

        if y.size > 0:
            unique_y_chunk = \
                (yield from recursive_tile(mt.unique(y, aggregate_size=1))).chunks[0]
            chunk_inputs.append(unique_y_chunk)
            chunk_op._unique_y = unique_y_chunk

        chunk = chunk_op.new_chunk(chunk_inputs, dtype=out.dtype,
                                   shape=out.shape, order=out.order, index=())
        params = out.params
        params['nsplits'] = ()
        params['chunks'] = [chunk]
        new_op = op.copy()
        return new_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def _execute(cls, ctx, op):
        """Return the target type string for the chunk operand ``op``."""
        is_multilabel_ = ctx[op.is_multilabel.key]
        shape = op.y_shape
        ndim = len(shape)
        dtype = op.y_dtype

        if is_multilabel_:
            return 'multilabel-indicator'

        # ``first_value`` is only produced by ``tile`` for non-empty object
        # targets; an object target of shape (n, 0) has ``shape[0]`` truthy
        # but no first value, so guard against dereferencing ``None``.
        if ndim > 2 or (dtype == object and shape[0] and
                        op.first_value is not None and
                        not isinstance(ctx[op.first_value.key], str)):
            return 'unknown'  # [[[1, 2]]] or [obj_1] and not ["label_1"]

        if ndim == 2 and shape[1] == 0:
            return 'unknown'  # [[]]

        if ndim == 2 and shape[1] > 1:
            suffix = '-multioutput'  # [[1, 2], [1, 2]]
        else:
            suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

        # check float and contains non-integer float values
        if dtype.kind == 'f' and ctx[op.check_float.key]:
            # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
            assert ctx[op.assert_all_finite.key]
            return 'continuous' + suffix

        if op.unique_y is not None:
            unique_y_len = len(ctx[op.unique_y.key])
        else:
            # y.size == 0
            unique_y_len = 0
        if (unique_y_len > 2) or (ndim >= 2 and shape[1] > 1):
            return 'multiclass' + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        else:
            return 'binary'  # [1, 2] or [["a"], ["b"]]

    @classmethod
    def execute(cls, ctx, op):
        target = cls._execute(ctx, op)
        # When a whitelist of target types was attached to the operand,
        # reject any computed type that is not part of it.
        if op.checked_targets is not None and len(op.checked_targets) > 0:
            if target not in op.checked_targets:
                raise ValueError(f'Unknown label type: {target}')
        ctx[op.outputs[0].key] = target
def type_of_target(y):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        A lazy 0-d tensor; call ``.execute()`` to obtain the string.
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d tensor of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d tensor that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, a tensor
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          tensor, sequence of sequences, or a tensor of non-sequence objects.

    Raises
    ------
    ValueError
        If ``y`` is not array-like (or is a string), or is a pandas
        ``SparseSeries`` / ``SparseArray``.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import type_of_target
    >>> type_of_target([0.1, 0.6]).execute()
    'continuous'
    >>> type_of_target([1, -1, -1, 1]).execute()
    'binary'
    >>> type_of_target(['a', 'b', 'a']).execute()
    'binary'
    >>> type_of_target([1.0, 2.0]).execute()
    'binary'
    >>> type_of_target([1, 0, 2]).execute()
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0]).execute()
    'multiclass'
    >>> type_of_target(['a', 'b', 'c']).execute()
    'multiclass'
    >>> type_of_target(mt.array([[1, 2], [3, 1]])).execute()
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]]).execute()
    'multiclass-multioutput'
    >>> type_of_target(mt.array([[1.5, 2.0], [3.0, 1.6]])).execute()
    'continuous-multioutput'
    >>> type_of_target(mt.array([[0, 1], [1, 1]])).execute()
    'multilabel-indicator'
    """
    # ``spmatrix`` is None when scipy could not be imported.
    valid_types = (Sequence, spmatrix) if spmatrix is not None else (Sequence,)
    valid = ((isinstance(y, valid_types) or hasattr(y, '__array__')
              or hasattr(y, '__mars_tensor__'))
             and not isinstance(y, str))

    if not valid:
        raise ValueError(f'Expected array-like (array or non-string sequence), got {y}')

    sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray'])
    if sparse_pandas:  # pragma: no cover
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if isinstance(y, ENTITY_TYPE):
        y = mt.tensor(y)

    op = TypeOfTarget(y=y)
    return op(y)
def check_classification_targets(y):
    """
    Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are
    allowed:

        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    The list of allowed types is attached to the operand behind the
    returned object rather than being checked immediately.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    The object returned by ``type_of_target(y)``, with the allowed target
    types recorded on its operand.
    """
    allowed_types = [
        'binary',
        'multiclass',
        'multiclass-multioutput',
        'multilabel-indicator',
        'multilabel-sequences',
    ]
    target_type = type_of_target(y)
    target_type.op._checked_targets = allowed_types
    return target_type