Skip to content
This repository has been archived by the owner on Aug 21, 2020. It is now read-only.

trims VOiCES data #82

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions cyphercat/datadefs/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from torch.utils.data.dataset import Subset

import pandas as pd
import numpy as np


def dataset_split(dataset=None, lengths=None, indices=None):
Expand Down Expand Up @@ -125,7 +126,7 @@ def splitter(dfs={}, df=None, unique_categories=[], category_id='', splits=[],


def splitter2(dfs={}, df=None, unique_categories=[], category_id='', splits=[],
N=-1, split_by_class=False):
N=-1, split_by_class=False, trim=True):
""" Splits the data for given unique categories according to specified
fractions.

Expand All @@ -151,6 +152,10 @@ def splitter2(dfs={}, df=None, unique_categories=[], category_id='', splits=[],
Todo:
- Add example.
"""
if trim:
mics = [1, 4, 5, 8, 9, 11]
else:
mics = np.arange(1, 13)
# N is to keep track of the dataframe dict keys
n_splits = len(splits)

Expand All @@ -177,8 +182,10 @@ def splitter2(dfs={}, df=None, unique_categories=[], category_id='', splits=[],
if i_cat == 0:
dfs[idx + N] = df[df['speaker_id'] == category]
else:
dfs[idx + N] = dfs[idx + N].append(df[df['speaker_id'] ==
category])
trunc1 = df[df['speaker_id'] == category]
trunc2 = trunc1[trunc1['Mic'].isin(mics)]

dfs[idx + N] = dfs[idx + N].append(trunc2)
start_category += n_categories
for idx in range(n_splits):
dfs[idx + N] = dfs[idx + N].reset_index()
Expand All @@ -187,6 +194,9 @@ def splitter2(dfs={}, df=None, unique_categories=[], category_id='', splits=[],
for category in unique_categories: # for each category

mini_df = df[df[category_id] == category]
# Trim to just half the mics (half recordings)
mini_df = mini_df[mini_df['Mic'].isin(mics)]

mini_df = mini_df.reset_index()

# Identify segments:
Expand Down
62 changes: 42 additions & 20 deletions cyphercat/datadefs/voices_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def load_or_index_subset(subset=None, path=None, fragment_seconds=3,
return df


def Voices_preload_and_split(subset='room-1', seconds=3,
path=None, pad=False, splits=None):
def Voices_preload_and_split(subset='room-1', test_subset='room-2', seconds=3,
path=None, pad=False, splits=None, trim=True):
"""Index and split librispeech dataset.

Args:
Expand All @@ -117,6 +117,7 @@ def Voices_preload_and_split(subset='room-1', seconds=3,
samples with lenght below the minimum.
splits (dict): dictionary with {name:[fractions]} for a user specified
split. The split will be saved to 'DATASPLITS_DIR' under 'name'
trim (bool): if True, trims the data by more than half: keeps only
            half of the microphones and removes recordings with no added noise

Returns:
dict(Dataframes): Dictionary containing the dataframes corresponding
Expand All @@ -139,6 +140,14 @@ def Voices_preload_and_split(subset='room-1', seconds=3,
' and subset = {}'.format(seconds, subset))
df = load_or_index_subset(subset=subset, path=path,
fragment_seconds=fragment_seconds, pad=pad)

test_df = load_or_index_subset(subset=test_subset, path=path,
fragment_seconds=fragment_seconds, pad=pad)

# remove all None sound from df
if trim:
df = df[df['Noise'] != 'none']

# Convert arbitrary integer labels of dataset to ordered 0-(num_speakers
# - 1) labels
unique_speakers = sorted(df['speaker_id'].unique())
Expand Down Expand Up @@ -172,7 +181,8 @@ def Voices_preload_and_split(subset='room-1', seconds=3,
# VOiCES is parsed by developers (not users), so will include
# a warning
print('WARNING: Creating default speaker splits for VOiCES!')
dfs = default_speaker_splitter2(dfs, df)
dfs = default_speaker_splitter2(
dfs, df, trim=trim, test_df=test_df)
# write the default dataframes
for i_df, this_df in enumerate(dfs):
dfs[this_df] = dfs[this_df].drop(columns=['id'])
Expand Down Expand Up @@ -203,7 +213,7 @@ def Voices_preload_and_split(subset='room-1', seconds=3,
# LibriSpeech is parsed by developers (not users), so will include
# a warning
print('WARNING: Creating default sample splits for VOiCES!')
sample_dfs = default_sample_splitter(sample_dfs, df)
sample_dfs = default_sample_splitter(sample_dfs, df, trim)
# write the default dataframes
for i_df, this_df in enumerate(sample_dfs):
sample_dfs[this_df] = sample_dfs[this_df].drop(columns=['id'])
Expand Down Expand Up @@ -235,6 +245,7 @@ def Voices_preload_and_split(subset='room-1', seconds=3,

dfs = splitter(dfs=dfs, df=df, unique_categories=unique_speakers1,
category_id='speaker_id', splits=splits, N=0)

dfs = splitter(dfs=dfs, df=df, unique_categories=unique_speakers2,
category_id='speaker_id', splits=splits, N=2)

Expand Down Expand Up @@ -326,7 +337,7 @@ def index_subset(path=None, subset=None):
return audio_files


def default_speaker_splitter(dfs=None, df=None):
def default_speaker_splitter(dfs=None, df=None, trim=True):
""" Performs the cyphercat default split for the LibriSpeech dataset.

Args:
Expand Down Expand Up @@ -358,22 +369,22 @@ def default_speaker_splitter(dfs=None, df=None):
# splits speakers in 0.8/0.2 split for target
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[:n_male],
category_id=cat_id, splits=[0.8, 0.2], N=0)
category_id=cat_id, splits=[0.8, 0.2], N=0, trim=trim)
# splits by speaker for attack
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[n_male:],
category_id=cat_id, splits=[0.5, 0.5],
N=2, split_by_class=True)
N=2, split_by_class=True, trim=trim)
m_dfs[4] = m_dfs[0][:len(m_dfs[1])]
# female splits
f_dfs = {}
f_dfs = splitter2(dfs=f_dfs, df=female_df,
unique_categories=unique_female[:n_female],
category_id=cat_id, splits=[0.8, 0.2], N=0)
category_id=cat_id, splits=[0.8, 0.2], N=0, trim=trim)
f_dfs = splitter2(dfs=f_dfs, df=female_df,
unique_categories=unique_female[n_female:],
category_id=cat_id, splits=[0.5, 0.5], N=2,
split_by_class=True)
split_by_class=True, trim=trim)
f_dfs[4] = f_dfs[0][:len(f_dfs[1])]
# merge male and female into final splits
for i_split in range(5):
Expand All @@ -384,7 +395,7 @@ def default_speaker_splitter(dfs=None, df=None):
return dfs


def default_speaker_splitter2(dfs=None, df=None):
def default_speaker_splitter2(dfs=None, df=None, trim=False, test_df=None):
""" Performs the cyphercat default split for the LibriSpeech dataset.

Args:
Expand Down Expand Up @@ -419,44 +430,55 @@ def default_speaker_splitter2(dfs=None, df=None):
# splits speakers in 0.8/0.2 split for target
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[:n_male*n1],
category_id=cat_id, splits=[0.8, 0.2], N=0)
category_id=cat_id, splits=[0.8, 0.2], N=0, trim=trim)
# splits by speaker for attack
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[n_male*n1:n_male*n2],
category_id=cat_id, splits=[0.5, 0.5],
N=2, split_by_class=True)
N=2, split_by_class=True, trim=trim)
# split off unheard speakers for outset
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[n_male*n2:],
category_id=cat_id, splits=[0, 1],
N=4, split_by_class=True)
N=4, split_by_class=True, trim=trim)
# Replace in set with subset of df0
m_dfs[4] = m_dfs[0][:len(m_dfs[1])]
# female splits
f_dfs = {}
f_dfs = splitter2(dfs=f_dfs, df=female_df,
unique_categories=unique_female[:n_female*n1],
category_id=cat_id, splits=[0.8, 0.2], N=0)
category_id=cat_id, splits=[0.8, 0.2], N=0, trim=trim)
f_dfs = splitter2(dfs=f_dfs, df=female_df,
unique_categories=unique_female[n_female*n1:n_female*n2],
category_id=cat_id, splits=[0.5, 0.5], N=2,
split_by_class=True)
split_by_class=True, trim=trim)
f_dfs = splitter2(dfs=f_dfs, df=female_df,
unique_categories=unique_female[n_female*n2:],
category_id=cat_id, splits=[0, 1], N=4,
split_by_class=True)
split_by_class=True, trim=trim)
f_dfs[4] = f_dfs[0][:len(f_dfs[1])]
# merge male and female into final splits
for i_split in range(6):
print('Merging split %i\n Male: %i and Female: %i' %
(i_split, len(m_dfs[i_split]), len(f_dfs[i_split])))
dfs[i_split] = m_dfs[i_split].append(f_dfs[i_split])

# make dfs[1] identical data to train room, but from test room

# get identifiers for the room 2 files we need
fids = [f[-50:] for f in dfs[1].filepath] # correct files
print(len(fids), ' original files')
fids2 = [f[-50:] for f in test_df.filepath] # files to check

bools = [elem in fids for elem in fids2]
print(sum(bools), ' matched files from room 2')
dfs[1] = test_df[bools]

return dfs


def default_sample_splitter(dfs=None, df=None):
""" Performs cycpercat default split for librspeech dataset.
def default_sample_splitter(dfs=None, df=None, trim=True):
""" Performs the cyphercat default split for the LibriSpeech dataset.

Args:
dfs (dict(Dataframe)): Current dictionary of dataframes.
Expand Down Expand Up @@ -486,10 +508,10 @@ def default_sample_splitter(dfs=None, df=None):
m_dfs = {}
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[:n_male],
category_id=cat_id, splits=[0.8, 0.2], N=0)
category_id=cat_id, splits=[0.8, 0.2], N=0, trim=trim)
m_dfs = splitter2(dfs=m_dfs, df=male_df,
unique_categories=unique_male[n_male:],
category_id=cat_id, splits=[0.5, 0.5], N=2)
category_id=cat_id, splits=[0.5, 0.5], N=2, trim=trim)
m_dfs[4] = m_dfs[0][:len(m_dfs[1])]
# female splits
f_dfs = {}
Expand Down
3 changes: 2 additions & 1 deletion cyphercat/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
REPO_DIR = os.path.split(CYCAT_DIR)[0]

# Local directory for datasets
DATASETS_DIR = os.path.join(REPO_DIR, 'Datasets')
DATASETS_DIR = os.path.join(
REPO_DIR, '/../../../fs4/datasets/') # , 'Datasets')

# Local directory for datasets
DATASPLITS_DIR = os.path.join(DATASETS_DIR, 'splits')