
Commit

preprocessing
zomux committed Nov 17, 2016
1 parent 48255b0 commit bf2b0cb
Showing 15 changed files with 136 additions and 42 deletions.
4 changes: 3 additions & 1 deletion deepy/dataset/__init__.py
@@ -10,4 +10,6 @@
from seq_mini_batch import SequentialMiniBatches
from binarized_mnist import BinarizedMnistDataset
from bunch_seq import BunchSequences
from ondisk_dataset import OnDiskDataset
from ondisk_dataset import OnDiskDataset
from data_processor import DataProcessor
from padding import pad_dataset
10 changes: 10 additions & 0 deletions deepy/dataset/data_processor.py
@@ -0,0 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


class DataProcessor(object):
"""
An abstract class for data processor.
"""
def process(self, split, epoch, dataset):
return dataset
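
Example (not part of the commit): a minimal sketch of a concrete processor. process receives the split name ("train", "valid" or "test"), the current epoch counter, and the data for that split, and returns the data, possibly transformed; the subclass below is hypothetical.

from deepy.dataset import DataProcessor

class ShortFirstProcessor(DataProcessor):
    # Hypothetical crude curriculum: only feed the first 1000 training
    # examples during the first two epochs, pass everything else through.
    def process(self, split, epoch, dataset):
        if split == "train" and epoch <= 2:
            return list(dataset)[:1000]
        return dataset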
51 changes: 24 additions & 27 deletions deepy/dataset/ondisk_dataset.py
@@ -6,6 +6,7 @@

import types
from . import Dataset
from data_processor import DataProcessor
from deepy.utils import FakeGenerator, StreamPickler, global_rand

import logging as loggers
@@ -19,7 +20,10 @@ class OnDiskDataset(Dataset):
"""

def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
cached=False, post_processing=None, shuffle_memory=False, curriculum=None):
cached=False, post_processing=None, shuffle_memory=False, data_processor=None):
"""
:type data_processor: DataProcessor
"""
self._train_path = train_path
self._valid_path = valid_path
self._test_path = test_path
@@ -28,48 +32,41 @@ def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
self._cached_train_data = None
self._post_processing = post_processing if post_processing else lambda x: x
self._shuffle_memory = shuffle_memory
self._curriculum = curriculum
self._curriculum_count = 0
if curriculum and not callable(curriculum):
raise Exception("curriculum function must be callable")
if curriculum and not cached:
raise Exception("curriculum learning needs training data to be cached")
self._epoch = 0
self._data_processor = data_processor
if data_processor and not isinstance(data_processor, DataProcessor):
raise Exception("data_processor must be an instance of DataProcessor.")
if self._cache_on_memory:
logging.info("Cache on memory")
self._cached_train_data = list(map(self._post_processing, StreamPickler.load(open(self._train_path))))
self._train_size = len(self._cached_train_data)
# if self._shuffle_memory:
# logging.info("Shuffle on-memory data")
# global_rand.shuffle(self._cached_train_data)
if self._shuffle_memory:
logging.info("Shuffle on-memory data")
global_rand.shuffle(self._cached_train_data)

def curriculum_train_data(self):
self._curriculum_count += 1
logging.info("curriculum learning: round {}".format(self._curriculum_count))
return self._curriculum(self._cached_train_data, self._curriculum_count)
def _process_data(self, split, epoch, dataset):
if self._data_processor:
return self._data_processor.process(split, epoch, dataset)
else:
return dataset

def generate_train_data(self):
for data in StreamPickler.load(open(self._train_path)):
self._epoch += 1
data_source = self._cached_train_data if self._cache_on_memory else StreamPickler.load(open(self._train_path))
for data in self._process_data('train', self._epoch, data_source):
yield self._post_processing(data)

def generate_valid_data(self):
for data in StreamPickler.load(open(self._valid_path)):
data_source = StreamPickler.load(open(self._valid_path))
for data in self._process_data('valid', self._epoch, data_source):
yield self._post_processing(data)

def generate_test_data(self):
for data in StreamPickler.load(open(self._test_path)):
data_source = StreamPickler.load(open(self._test_path))
for data in self._process_data('test', self._epoch, data_source):
yield self._post_processing(data)

def train_set(self):
if self._cache_on_memory:
if self._shuffle_memory:
logging.info("shuffle on-memory data")
global_rand.shuffle(self._cached_train_data)
if self._curriculum:
if not isinstance(self._curriculum(self._cached_train_data, 1), types.GeneratorType):
raise Exception("Curriculum function must be a generator.")
return FakeGenerator(self, "curriculum_train_data")
else:
return self._cached_train_data
if not self._train_path:
return None
return FakeGenerator(self, "generate_train_data")
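
Example (not part of the commit): wiring a processor into OnDiskDataset. The file names and the SortByLength class are illustrative.

from deepy.dataset import OnDiskDataset, DataProcessor

class SortByLength(DataProcessor):
    # Assumes each record is an (input, target) pair; sorting by input
    # length keeps similarly sized sequences in neighbouring batches.
    def process(self, split, epoch, dataset):
        if split == "train":
            return sorted(dataset, key=lambda record: len(record[0]))
        return dataset

dataset = OnDiskDataset("train.pkl", valid_path="valid.pkl",
                        cached=True, data_processor=SortByLength())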
2 changes: 1 addition & 1 deletion deepy/dataset/padding.py
@@ -4,7 +4,7 @@
import numpy as np
from deepy.utils import FLOATX

def pad_dataset(subset, side, length):
def pad_dataset(subset, side="right", length=-1):
"""
Pad data set to specified length.
Parameters:
8 changes: 8 additions & 0 deletions deepy/layers/block.py
@@ -51,6 +51,14 @@ def compute_tensor(self, x):
def compute_test_tesnor(self, x):
return x

def load_params(self, path, exclude_free_params=False):
"""
Load parameters to the block.
"""
from deepy.networks.comp_graph import ComputationalGraph
model = ComputationalGraph(blocks=[self])
model.load_params(path, exclude_free_params=exclude_free_params)

@property
def all_parameters(self):
return self.parameters
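
Example (not part of the commit): restoring saved parameters into a single block, assuming Block is exported from deepy.layers and the parameter file was written earlier with save_params; the construction and file name are illustrative.

from deepy.layers import Block

block = Block()  # hypothetical block that layers were registered on
block.load_params("pretrained_block.npz", exclude_free_params=True)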
5 changes: 4 additions & 1 deletion deepy/layers/functions.py
@@ -9,7 +9,10 @@ def concatenate(vars, axis=-1):
"""
A utility function of concatenate.
"""
return Concatenate(axis=axis).compute(*vars)
concat_var = Concatenate(axis=axis).compute(*vars)
if axis == -1 or axis == vars[0].tensor.ndim - 1:
concat_var.output_dim = sum([x.output_dim for x in vars], 0)
return concat_var

@neural_computation
def ifelse(condition, then_branch, else_branch):
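
Example (not part of the commit): with this change, last-axis concatenation also sets output_dim on the result so a following layer can infer its input size. Assumes concatenate and NeuralVariable are importable as shown; the dimensions are illustrative.

import theano.tensor as T
from deepy.layers import concatenate
from deepy.layers.neural_var import NeuralVariable

var_a = NeuralVariable(T.matrix("a"), dim=100)
var_b = NeuralVariable(T.matrix("b"), dim=50)
merged = concatenate([var_a, var_b], axis=-1)
# merged.output_dim == 150: the operands' output_dim values are summed
# automatically when concatenating along the last axis.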
4 changes: 2 additions & 2 deletions deepy/layers/layer.py
@@ -93,10 +93,10 @@ def compute(self, *inputs, **kwargs):
output = self.compute_tensor(*[t.tensor for t in inputs], **train_kwargs)
test_output = self.compute_test_tesnor(*[t.test_tensor for t in inputs], **test_kwargs)

if type(output) != list:
if type(output) != list and type(output) != tuple:
return NeuralVariable(output, test_output, self.output_dim)
else:
return [NeuralVariable(*item) for item in zip(self.output_dims, output, test_output)]
return [NeuralVariable(*item) for item in zip(output, test_output, self.output_dims)]

def prepare(self):
"""
16 changes: 16 additions & 0 deletions deepy/layers/recurrent.py
@@ -75,6 +75,22 @@ def merge_inputs(self, input_var, additional_inputs=None):
def prepare(self):
pass

@neural_computation
def compute_step(self, state, lstm_cell=None, input=None, additional_inputs=None):
"""
Compute one step in the RNN.
:return: one variable for RNN and GRU, multiple variables for LSTM
"""
input_map = self.merge_inputs(input, additional_inputs=additional_inputs)
input_map.update({"state": state, "lstm_cell": lstm_cell})
output_map = self.compute_new_state(input_map)
outputs = [output_map.pop("state")]
outputs += output_map.values()
if len(outputs) == 1:
return outputs[0]
else:
return outputs

@neural_computation
def get_initial_states(self, input_var):
"""
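
Example (not part of the commit): a rough sketch of step-by-step decoding with compute_step. All names are illustrative, and the exact keys of the initial-state map are assumptions.

# Hypothetical one-step loop with a GRU layer:
state = gru.get_initial_states(input_var)["state"]   # assuming a "state" entry
for step_input in step_inputs:                        # one neural variable per time step
    state = gru.compute_step(state, input=step_input)

# An LSTM additionally threads the cell through each step:
# state, cell = lstm.compute_step(state, lstm_cell=cell, input=step_input)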
5 changes: 4 additions & 1 deletion deepy/networks/comp_graph.py
@@ -11,7 +11,7 @@ class ComputationalGraph(NeuralNetwork):
"""

def __init__(self, input_dim=0, model=None, input_tensor=None, monitors=None,
cost=None, output=None, outputs=None, blocks=None, input_vars=None, target_vars=None):
cost=None, output=None, outputs=None, blocks=None, input_vars=None, target_vars=None, output_map=None):
"""
Create a basic network.
@@ -41,6 +41,9 @@ def __init__(self, input_dim=0, model=None, input_tensor=None, monitors=None,
if not output and not cost:
self._test_output = None
self._test_outputs = [o.test_tensor for o in outputs]

self.output_map = output_map if output_map else {}

if monitors:
if type(monitors) == dict:
monitors = monitors.items()
4 changes: 4 additions & 0 deletions deepy/preprocessing/__init__.py
@@ -0,0 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from padding import pad_sequence
26 changes: 26 additions & 0 deletions deepy/preprocessing/padding.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from itertools import izip, izip_longest

def pad_sequence(batch, pad_value=0, output_mask=True, length=None):
if length:
max_len = length
else:
max_len = max(map(len, batch))
mask = None
if output_mask:
mask = []
for i in range(len(batch)):
mask.append([1] * len(batch[i]) + [0] * (max_len - len(batch[i])))
mask = np.array(mask, dtype="float32")
if length:
new_batch = []
for i in range(len(batch)):
new_row = list(batch[i]) + [pad_value] * (max_len - len(batch[i]))
new_batch.append(new_row)
new_batch = np.array(new_batch)
else:
new_batch = np.array(list(izip(*izip_longest(*batch, fillvalue=pad_value))))
return new_batch, mask
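
Example (not part of the commit): a small usage sketch of pad_sequence.

from deepy.preprocessing import pad_sequence

batch = [[1, 2, 3], [4, 5]]
padded, mask = pad_sequence(batch, pad_value=0)
# padded -> array([[1, 2, 3], [4, 5, 0]])
# mask   -> array([[1., 1., 1.], [1., 1., 0.]], dtype=float32)

# A fixed target length (e.g. for bucketing) pads every row out to that length:
padded, mask = pad_sequence(batch, length=5)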
7 changes: 4 additions & 3 deletions deepy/trainers/base.py
@@ -230,7 +230,7 @@ def _run_train(self, epoch, train_set, train_size=None):
self.last_run_costs = costs
return costs

def _run_valid(self, epoch, valid_set, dry_run=False):
def _run_valid(self, epoch, valid_set, dry_run=False, save_path=None):
"""
Run one valid iteration, return true if to continue training.
"""
@@ -246,9 +246,10 @@ def _run_valid(self, epoch, valid_set, dry_run=False):
self.best_cost = J
self.best_epoch = epoch

if self.config.auto_save and self._skip_batches == 0:
save_path = save_path if save_path else self.config.auto_save
if save_path and self._skip_batches == 0:
self.network.train_logger.record_progress(self._progress)
self.network.save_params(self.config.auto_save, new_thread=True)
self.network.save_params(save_path, new_thread=True)

info = ' '.join('%s=%.2f' % el for el in costs)
epoch_str = "epoch=%d" % (epoch + 1)
3 changes: 2 additions & 1 deletion deepy/utils/activations.py
@@ -49,7 +49,8 @@ def compose(a, b):
'theano_softmax': T.nnet.softmax,

# shorthands
'relu': lambda z: z * (z > 0),
'relu': lambda z: T.nnet.relu(z),
'leaky_relu': lambda z: T.nnet.relu(z, 0.01),
'trel': lambda z: z * (z > 0) * (z < 1),
'trec': lambda z: z * (z > 1),
'tlin': lambda z: z * (abs(z) > 1),
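
Example (not part of the commit): what the updated 'relu' and new 'leaky_relu' shorthands compute, assuming a Theano version that provides T.nnet.relu (0.7.1 or later).

import numpy as np
import theano
import theano.tensor as T

z = T.vector("z")
relu = theano.function([z], T.nnet.relu(z))          # the 'relu' shorthand
leaky = theano.function([z], T.nnet.relu(z, 0.01))   # the 'leaky_relu' shorthand

x = np.array([-2.0, 0.5], dtype=theano.config.floatX)
relu(x)    # -> [ 0.  ,  0.5 ]
leaky(x)   # -> [-0.02,  0.5 ]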
27 changes: 22 additions & 5 deletions deepy/utils/decorations.py
@@ -1,13 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from theano.tensor.var import TensorVariable

def convert_to_theano_var(obj):
"""
Convert neural vars to theano vars.
:param obj: NeuralVariable or list or dict or tuple
:return: theano var, test var, tensor found, neural var found
"""
from theano.tensor.var import TensorVariable
from deepy.layers.neural_var import NeuralVariable
if type(obj) == tuple:
return tuple(convert_to_theano_var(list(obj)))
@@ -39,6 +40,18 @@ def convert_to_theano_var(obj):
return obj.tensor, obj.test_tensor, False, True
elif type(obj) == TensorVariable:
return obj, obj, True, False
elif type(obj) == slice:
normal_args = []
test_args = []
theano_var_found = False
neural_var_found = False
for arg in [obj.start, obj.stop, obj.step]:
normal_var, test_var, tensor_found, neural_found = convert_to_theano_var(arg)
normal_args.append(normal_var)
test_args.append(test_var)
if tensor_found: theano_var_found = True
if neural_found: neural_var_found = True
return slice(*normal_args), slice(*test_args), theano_var_found, neural_var_found
else:
return obj, obj, False, False

@@ -74,7 +87,6 @@ def neural_computation(original_func, prefer_tensor=False):
"""

def wrapper(*args, **kwargs):

normal_args, test_args, tensor_found_in_args, neural_found_in_args = convert_to_theano_var(args)
normal_kwargs, test_kwargs, tensor_found_in_kwargs, neural_found_in_kwargs = convert_to_theano_var(kwargs)

@@ -90,10 +102,15 @@ def wrapper(*args, **kwargs):
# No neural variables are inputted, so output tensors
return normal_result
else:
# Output neural variables
# Output neural variables, auto set output_dim
test_result = original_func(*test_args, **test_kwargs)
return convert_to_neural_var(normal_result, test_result)

result_var = convert_to_neural_var(normal_result, test_result)
if (isinstance(normal_result, TensorVariable) and
hasattr(normal_result.tag, "test_value") and
hasattr(normal_result.tag.test_value, "shape") and
normal_result.tag.test_value.shape):
result_var.output_dim = normal_result.tag.test_value.shape[-1]
return result_var
return wrapper

def neural_computation_prefer_tensor(original_func):
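
Example (not part of the commit): the new slice branch lets symbolic tensors (and neural variables) appear as slice bounds when they are passed into a @neural_computation function; a direct illustration with convert_to_theano_var.

import theano.tensor as T
from deepy.utils.decorations import convert_to_theano_var

stop = T.iscalar("stop")
s = slice(0, stop, None)
normal_slice, test_slice, tensor_found, neural_found = convert_to_theano_var(s)
# normal_slice == slice(0, stop, None) and tensor_found is True, so decorated
# functions can now accept slices whose bounds are symbolic.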
6 changes: 6 additions & 0 deletions deepy/utils/neural_tensor.py
@@ -1,8 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import theano.tensor as T
from theano import tensor as theano_tensor
from decorations import neural_computation
from deepy.layers.neural_var import NeuralVariable

class NeuralTensorNet(object):

@@ -17,8 +19,12 @@ def wrapper(*args, **kwargs):
class NeuralTensor(object):
"""
A class for exporting Theano tensor operations to neural variables.
"""

def constant(self, value, dtype="float32", dim=None):
return NeuralVariable(T.constant(value, dtype=dtype), dim=dim)

def __getattr__(self, func_name):
global deepy_nnet
@neural_computation
