# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Sequence
import numpy as np
try:
from scipy.sparse.base import spmatrix
except ImportError: # pragma: no cover
spmatrix = None
from ... import opcodes as OperandDef
from ... import tensor as mt
from ...core import ENTITY_TYPE, recursive_tile
from ...serialization.serializables import KeyField, BoolField, TupleField, DataTypeField, AnyField, ListField
from ...tensor.core import TensorOrder
from ..operands import LearnOperand, LearnOperandMixin, OutputType
from ..utils import assert_all_finite
class IsMultilabel(LearnOperand, LearnOperandMixin):
    """Operand deciding whether a target ``y`` is in multilabel format.

    At tileable level the operand holds ``y`` and, when available, the
    tensor of unique values of ``y``. At chunk level it holds only the
    single chunk of unique values plus a flag telling whether ``y`` is
    sparse. The output is a 0-d boolean tensor.
    """

    _op_type_ = OperandDef.IS_MULTILABEL

    # the target; either a tensor entity or an arbitrary Python object
    _y = AnyField('y')
    # unique values of ``y``; None when the caller did not compute them
    _unique_y = KeyField('unique_y')
    # for chunk
    _is_y_sparse = BoolField('is_y_sparse')

    def __init__(self, y=None, unique_y=None, is_y_sparse=None, **kw):
        super().__init__(_y=y, _unique_y=unique_y,
                         _is_y_sparse=is_y_sparse, **kw)
        self.output_types = [OutputType.tensor]

    @property
    def y(self):
        return self._y

    @property
    def unique_y(self):
        return self._unique_y

    @property
    def is_y_sparse(self):
        return self._is_y_sparse

    def _set_inputs(self, inputs):
        """Rebind entity-valued fields to the operand's actual inputs."""
        super()._set_inputs(inputs)
        # ``__call__`` puts ``y`` first (when it is an entity) and
        # ``unique_y`` last (when given), so the positions are fixed.
        if isinstance(self._y, ENTITY_TYPE):
            self._y = self._inputs[0]
        if self._unique_y is not None:
            self._unique_y = self._inputs[-1]

    def __call__(self, y, y_unique=None):
        """Create the 0-d boolean output tileable.

        Parameters
        ----------
        y : tensor entity or any object
            Only added to the inputs when it is an entity.
        y_unique : tensor, optional
            Unique values of ``y``; appended to the inputs when given.
        """
        inputs = [y] if isinstance(y, ENTITY_TYPE) else []
        if y_unique is not None:
            inputs.append(y_unique)
        return self.new_tileable(inputs, shape=(), dtype=np.dtype(bool),
                                 order=TensorOrder.C_ORDER)

    @classmethod
    def tile(cls, op):
        y = op.y
        out = op.outputs[0]
        unique_y = op.unique_y

        has_many_columns = \
            hasattr(y, 'shape') and y.ndim == 2 and y.shape[1] > 1
        # ``unique_y`` is None when ``is_multilabel`` skipped computing it
        # (object dtype or no dtype at all). ``execute`` could only answer
        # False for such input, so short-circuit instead of failing on
        # ``unique_y.chunks`` below.
        if unique_y is None or not has_many_columns:
            result = yield from recursive_tile(mt.array(False))
            return [result]

        assert len(unique_y.chunks) == 1
        unique_y_chunk = unique_y.chunks[0]
        chunk_op = IsMultilabel(unique_y=unique_y_chunk,
                                is_y_sparse=y.issparse())
        # NOTE(review): the chunk is 0-d but gets index ``(0,)`` while
        # TypeOfTarget.tile uses ``()`` for its 0-d chunk -- confirm
        # which one is intended.
        chunk = chunk_op.new_chunk([unique_y_chunk], dtype=out.dtype,
                                   order=out.order, index=(0,),
                                   shape=())

        new_op = op.copy()
        params = out.params
        params['nsplits'] = ()
        params['chunks'] = [chunk]
        return new_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def execute(cls, ctx, op):
        unique_y = ctx[op.unique_y.key]

        if op.is_y_sparse:
            # sparse: at most one distinct value allowed
            few_labels = unique_y.size in (0, 1)
        else:
            # dense: fewer than three distinct values allowed
            few_labels = len(unique_y) < 3

        # The dtype test stays on the right of ``and`` so that the
        # integral-float scan only runs when the label count qualifies.
        result = few_labels and (
            unique_y.dtype.kind in 'biu'  # bool, int, uint
            or _is_integral_float(unique_y))

        ctx[op.outputs[0].key] = result
def _is_integral_float(y):
return y.dtype.kind == 'f' and np.all(y.astype(int) == y)
def is_multilabel(y):
    """ Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : tensor of bool
        0-d lazy tensor; once executed it holds ``True`` if ``y`` is in a
        multilabel format, else ``False``.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1]).execute()
    False
    >>> is_multilabel([[1], [0, 2], []]).execute()
    False
    >>> is_multilabel(mt.array([[1, 0], [0, 0]])).execute()
    True
    >>> is_multilabel(mt.array([[1], [0], [0]])).execute()
    False
    >>> is_multilabel(mt.array([[1, 0, 0]])).execute()
    True
    """
    if not isinstance(y, ENTITY_TYPE):
        # Array-likes and sequences go through numpy first so that they
        # gain ``shape``/``dtype`` before being wrapped as a tensor.
        if hasattr(y, '__array__') or isinstance(y, Sequence):
            y = np.asarray(y)
        if hasattr(y, 'shape'):
            yt = y = mt.asarray(y)
        else:
            # Not tensor-like: the operand gets no tensor input for ``y``.
            yt = None
    else:
        yt = y = mt.tensor(y)

    # Unique values are only computed for non-object dtypes; otherwise the
    # operand receives ``unique_y=None``.
    if hasattr(y, 'dtype') and y.dtype != np.object_:
        unique_y = mt.unique(y, aggregate_size=1)
    else:
        unique_y = None

    op = IsMultilabel(y=y, unique_y=unique_y)
    return op(yt, unique_y)
class TypeOfTarget(LearnOperand, LearnOperandMixin):
    """Operand inferring the target type string for ``y``.

    At tileable level only ``y`` (and optionally ``checked_targets``) is
    set. ``tile`` builds a single chunk operand whose inputs are the
    helper chunks listed under "for chunks" below; ``execute`` combines
    them into one of the type strings returned by ``_execute`` and stores
    it as a 0-d object tensor.
    """

    # NOTE(review): neither slot name is referenced elsewhere in the
    # visible part of this file -- confirm whether they are still needed.
    __slots__ = ('_unique_y_chunk', '_check_all_finite_chunk')
    _op_type_ = OperandDef.TYPE_OF_TARGET

    _y = AnyField('y')
    # for chunks
    _is_multilabel = KeyField('is_multilabel')
    _first_value = KeyField('first_value')
    _check_float = KeyField('check_float')
    _assert_all_finite = KeyField('assert_all_finite')
    _unique_y = KeyField('unique_y')
    _y_shape = TupleField('y_shape')
    _y_dtype = DataTypeField('y_dtype')
    # when non-empty, ``execute`` raises if the inferred type is not listed
    _checked_targets = ListField('checked_targets')

    def __init__(self, y=None, is_multilabel=None, first_value=None,
                 check_float=None, assert_all_finite=None,
                 unique_y=None, y_shape=None, y_dtype=None,
                 checked_targets=None, **kw):
        super().__init__(_y=y, _is_multilabel=is_multilabel,
                         _first_value=first_value, _check_float=check_float,
                         _assert_all_finite=assert_all_finite,
                         _unique_y=unique_y, _y_shape=y_shape,
                         _y_dtype=y_dtype, _checked_targets=checked_targets,
                         **kw)
        self.output_types = [OutputType.tensor]

    @property
    def y(self):
        return self._y

    @property
    def is_multilabel(self):
        return self._is_multilabel

    @property
    def first_value(self):
        return self._first_value

    @property
    def check_float(self):
        return self._check_float

    @property
    def assert_all_finite(self):
        return self._assert_all_finite

    @property
    def unique_y(self):
        return self._unique_y

    @property
    def y_shape(self):
        return self._y_shape

    @property
    def y_dtype(self):
        return self._y_dtype

    @property
    def checked_targets(self):
        return self._checked_targets

    def _set_inputs(self, inputs):
        """Rebind entity-valued fields to the operand's actual inputs.

        The attribute order below must match the order in which ``tile``
        appends chunks to the chunk operand's inputs.
        """
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        for attr in ['_y', '_is_multilabel', '_first_value',
                     '_check_float', '_assert_all_finite',
                     '_unique_y']:
            v = getattr(self, attr)
            if isinstance(v, ENTITY_TYPE):
                setattr(self, attr, next(inputs_iter))

    def __call__(self, y):
        """Create the 0-d object-dtype output tileable for ``y``."""
        inputs = [y] if isinstance(y, ENTITY_TYPE) else []
        return self.new_tileable(inputs, shape=(), order=TensorOrder.C_ORDER,
                                 dtype=np.dtype(object))

    @classmethod
    def tile(cls, op):
        out = op.outputs[0]
        y = op.y

        chunk_inputs = []
        is_multilabel_chunk = \
            (yield from recursive_tile(is_multilabel(y))).chunks[0]
        chunk_inputs.append(is_multilabel_chunk)

        if not isinstance(y, ENTITY_TYPE):
            if hasattr(y, '__array__'):
                y = np.asarray(y)
            y = mt.asarray(y)
        if np.isnan(y.size):  # pragma: no cover
            # Size unknown. NOTE(review): the bare yield presumably hands
            # control back until the shape is available -- confirm.
            yield

        # Forward ``checked_targets``: ``execute`` runs on the chunk
        # operand, so without it the validation would never trigger.
        chunk_op = TypeOfTarget(is_multilabel=is_multilabel_chunk,
                                y_shape=y.shape, y_dtype=y.dtype,
                                checked_targets=op.checked_targets)

        if y.ndim <= 2 and y.size > 0 and y.dtype == object:
            first_value_chunk = \
                (yield from recursive_tile(y[(0,) * y.ndim])).chunks[0]
            chunk_inputs.append(first_value_chunk)
            chunk_op._first_value = first_value_chunk

        if y.dtype.kind == 'f':
            check_float_chunk = (yield from recursive_tile(
                mt.any(y != y.astype(int)))).chunks[0]
            chunk_inputs.append(check_float_chunk)
            chunk_op._check_float = check_float_chunk

            assert_all_finite_chunk = (yield from recursive_tile(
                assert_all_finite(y))).chunks[0]
            chunk_inputs.append(assert_all_finite_chunk)
            chunk_op._assert_all_finite = assert_all_finite_chunk

        if y.size > 0:
            unique_y_chunk = (yield from recursive_tile(
                mt.unique(y, aggregate_size=1))).chunks[0]
            chunk_inputs.append(unique_y_chunk)
            chunk_op._unique_y = unique_y_chunk

        chunk = chunk_op.new_chunk(chunk_inputs, dtype=out.dtype,
                                   shape=out.shape, order=out.order,
                                   index=())

        params = out.params
        params['nsplits'] = ()
        params['chunks'] = [chunk]
        new_op = op.copy()
        return new_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def _execute(cls, ctx, op):
        """Return the target type string from the helper chunk results."""
        is_multilabel_ = ctx[op.is_multilabel.key]
        shape = op.y_shape
        ndim = len(shape)
        dtype = op.y_dtype

        if is_multilabel_:
            return 'multilabel-indicator'

        if ndim > 2:
            return 'unknown'  # [[[1, 2]]]

        # ``first_value`` is only attached by ``tile`` for non-empty object
        # arrays; guard against None so that e.g. shape (n, 0) falls
        # through to the empty-column check instead of failing on ``.key``.
        if (dtype == object and shape[0] and op.first_value is not None
                and not isinstance(ctx[op.first_value.key], str)):
            return 'unknown'  # [obj_1] and not ["label_1"]

        if ndim == 2 and shape[1] == 0:
            return 'unknown'  # [[]]

        if ndim == 2 and shape[1] > 1:
            suffix = '-multioutput'  # [[1, 2], [1, 2]]
        else:
            suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

        # check float and contains non-integer float values
        if dtype.kind == 'f' and ctx[op.check_float.key]:
            # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
            assert ctx[op.assert_all_finite.key]
            return 'continuous' + suffix

        if op.unique_y is not None:
            unique_y_len = len(ctx[op.unique_y.key])
        else:
            # y.size == 0
            unique_y_len = 0

        if (unique_y_len > 2) or (ndim >= 2 and shape[1] > 1):
            # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
            return 'multiclass' + suffix
        else:
            return 'binary'  # [1, 2] or [["a"], ["b"]]

    @classmethod
    def execute(cls, ctx, op):
        """Store the inferred type.

        Raises
        ------
        ValueError
            If ``checked_targets`` is non-empty and the inferred type is
            not one of its entries.
        """
        target = cls._execute(ctx, op)
        if op.checked_targets and target not in op.checked_targets:
            raise ValueError(f'Unknown label type: {target}')
        ctx[op.outputs[0].key] = target
def type_of_target(y):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : tensor of string
        0-d lazy tensor; once executed it holds one of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d tensor of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d tensor that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, a tensor
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          tensor, sequence of sequences, or a tensor of non-sequence objects.

    Raises
    ------
    ValueError
        If ``y`` is a string or not array-like, or if its class is named
        ``SparseSeries`` or ``SparseArray``.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import type_of_target
    >>> type_of_target([0.1, 0.6]).execute()
    'continuous'
    >>> type_of_target([1, -1, -1, 1]).execute()
    'binary'
    >>> type_of_target(['a', 'b', 'a']).execute()
    'binary'
    >>> type_of_target([1.0, 2.0]).execute()
    'binary'
    >>> type_of_target([1, 0, 2]).execute()
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0]).execute()
    'multiclass'
    >>> type_of_target(['a', 'b', 'c']).execute()
    'multiclass'
    >>> type_of_target(mt.array([[1, 2], [3, 1]])).execute()
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]]).execute()
    'multiclass-multioutput'
    >>> type_of_target(mt.array([[1.5, 2.0], [3.0, 1.6]])).execute()
    'continuous-multioutput'
    >>> type_of_target(mt.array([[0, 1], [1, 1]])).execute()
    'multilabel-indicator'
    """
    # scipy is optional; ``spmatrix`` is None when its import failed.
    valid_types = (Sequence, spmatrix) if spmatrix is not None else (Sequence,)
    valid = ((isinstance(y, valid_types) or
              hasattr(y, '__array__') or hasattr(y, '__mars_tensor__'))
             and not isinstance(y, str))

    if not valid:
        raise ValueError(f'Expected array-like (array or non-string sequence), got {y}')

    # Matched by class name so that pandas does not have to be imported.
    sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray'])
    if sparse_pandas:  # pragma: no cover
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if isinstance(y, ENTITY_TYPE):
        y = mt.tensor(y)

    op = TypeOfTarget(y=y)
    return op(y)
def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    The allowed list is stored on the operand of the tensor produced by
    ``type_of_target``; ``TypeOfTarget.execute`` raises ``ValueError`` when
    its operand carries a non-empty list that does not contain the
    inferred type.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    The tensor returned by ``type_of_target(y)``, with the allowed types
    attached to its operand.
    """
    target_type = type_of_target(y)
    allowed_types = [
        'binary',
        'multiclass',
        'multiclass-multioutput',
        'multilabel-indicator',
        'multilabel-sequences',
    ]
    target_type.op._checked_targets = allowed_types
    return target_type