Skip to content
60 changes: 60 additions & 0 deletions elasticdl_preprocessing/layers/to_sparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import tensorflow as tf


class ToSparse(tf.keras.layers.Layer):
"""Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
If the input is already a `SparseTensor`, just return it.

Example :
```python
layer = ToSparse()
inp = tf.constant([["A", ""], ["B", "C"]], tf.string)
layer.call(inp)
tf.SparseTensor(
indices=np.array([[0, 0], [1, 0], [1, 1]]),
values=np.array(["A", "B", "C"]),
dense_shape=(2, 2),
)
```

Arguments:
ignore_value: Entries in inputs equal to this value will be
absent from the output `SparseTensor`. If `None`, default value of
inputs dtype will be used ('' for `str`, -1 for `int`).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we expose this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-1 for int seems dangerous as this is application specific

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-1 for int is the default value and the layer will ignore it during transformation.

Copy link
Collaborator

@brightcoder01 brightcoder01 Mar 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-1 for int seems dangerous as this is application specific

Use -1 as the default ignore_value for int type is also the implementation inside feature column. Please check the code snippet.


Input shape: A numeric or string `Tensor` of shape
`[batch_size, d1, ..., dm]`

Output shape: An `SparseTensor` with the same shape as inputs
"""

def __init__(self, ignore_value=None):
super(ToSparse, self).__init__()
self.ignore_value = ignore_value

def call(self, inputs):
if isinstance(inputs, tf.SparseTensor):
return inputs
if self.ignore_value is None:
if inputs.dtype == tf.string:
print("string")
self.ignore_value = ""
elif inputs.dtype.is_integer:
self.ignore_value = -1
self.ignore_value = tf.cast(self.ignore_value, inputs.dtype)
indices = tf.where(tf.not_equal(inputs, self.ignore_value))
values = tf.gather_nd(inputs, indices)
dense_shape = tf.shape(inputs, out_type=tf.int64)
return tf.SparseTensor(
indices=indices, values=values, dense_shape=dense_shape
)

def compute_output_shape(self, input_shape):
return input_shape

def get_config(self):
config = {
"ignore_value": self.ignore_value,
}
base_config = super(ToSparse, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
34 changes: 34 additions & 0 deletions elasticdl_preprocessing/tests/to_sparse_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest

import numpy as np
import tensorflow as tf

from elasticdl_preprocessing.layers.to_sparse import ToSparse
from elasticdl_preprocessing.tests.test_utils import sparse_tensor_equal


class ToSparseTest(unittest.TestCase):
def test_to_sparse(self):
layer = ToSparse()
inp = tf.constant([["A", ""], ["B", "C"]], tf.string)
output = layer.call(inp)
expected_out = tf.SparseTensor(
indices=np.array([[0, 0], [1, 0], [1, 1]]),
values=np.array(["A", "B", "C"]),
dense_shape=(2, 2),
)
self.assertTrue(sparse_tensor_equal(output, expected_out))

layer = ToSparse()
inp = tf.constant([[12, -1], [45, 78]], tf.int64)
output = layer.call(inp)
expected_out = tf.SparseTensor(
indices=np.array([[0, 0], [1, 0], [1, 1]]),
values=np.array([12, 45, 78]),
dense_shape=(2, 2),
)
self.assertTrue(sparse_tensor_equal(output, expected_out))


if __name__ == "__main__":
unittest.main()