Solomon work #18
VTC is running for solomon on oberon; it should take about a week.

Release held jobs once tsimane2018 has started.
Current result, generated with the following script:

```python
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import os
import pandas as pd
import glob
import pympi

project = ChildProject('.')
am = AnnotationManager(project)

# List the raw ELAN annotation files
files = pd.DataFrame([
    {'raw_filename': os.path.join('solis', os.path.basename(f))}
    for f in glob.glob('raw_annotations/solis/*.eaf')
])

# Parse the recording basename, part number and annotator initials out of each filename
extract = files['raw_filename'].str.extract(r"([0-9]+_(?:CW[0-9]+|NA)_CH[0-9]+_(?:AJ|FB|LM)[0-9]+_(?:AJ|FB|LM)[0-9]+_[0-9]{6})_([0-9])?(?:.*)(?:\-|_)([A-Z]{2})")
files['recording_filename'] = extract[0] + '.WAV'
files['part'] = extract[1]
files['author'] = extract[2]

files = files.merge(project.recordings, how='inner', left_on='recording_filename', right_on='filename')

# Parts are 15-hour chunks; derive each part's offset (in seconds) into the recording
files['part'] = files['part'].fillna(1)
files['time_seek'] = (files['part'].astype(int) - 1) * 15 * 3600
files['set'] = 'solis_eaf_' + files['author']

# Collect the annotated time ranges from the 'code_periodic' tier of each EAF file
ranges = []
for f in files.to_dict(orient='records'):
    eaf = pympi.Elan.Eaf(os.path.join('raw_annotations', f['raw_filename']))
    portions = eaf.tiers['code_periodic'][0]
    for pid in portions:
        (start_ts, end_ts, value, svg_ref) = portions[pid]
        (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
        ranges.append({
            'raw_filename': f['raw_filename'],
            'range_onset': int(start_t/1000),
            'range_offset': int(end_t/1000),
        })

ranges = pd.DataFrame(ranges)

# .copy() avoids pandas' SettingWithCopyWarning when adding the 'format' column
input = files[['recording_filename', 'raw_filename', 'time_seek', 'set']].copy()
input['format'] = 'eaf'
ranges = ranges.merge(input, how='left', on='raw_filename')

am.import_annotations(ranges)
```

Based on LAAC-LSCP/ChildProject@54cee9b
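For reference, here is what that filename regex extracts from a sample raw annotation name (the filename below is invented for illustration):

```python
import re

# Invented filename following the solis naming scheme
name = "solis/3023_NA_CH15_AJ03_FB07_190512_2-test_AJ.eaf"
pattern = r"([0-9]+_(?:CW[0-9]+|NA)_CH[0-9]+_(?:AJ|FB|LM)[0-9]+_(?:AJ|FB|LM)[0-9]+_[0-9]{6})_([0-9])?(?:.*)(?:\-|_)([A-Z]{2})"

m = re.search(pattern, name)
print(m.groups())
# ('3023_NA_CH15_AJ03_FB07_190512', '2', 'AJ')
# i.e. recording basename, part number, annotator initials
```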
Last result: `import_eaf.py` (identical to the script above).
So the first step for solomon would be "Get all stats at the level of the recording"; the derived metrics that the analysis outputs are explained in the section "Derived metrics" (which also covers how vocalization events are "integrated").

Step two: basic checks. Once we have all those metrics for each recording, check the correlation and error rate across the two recordings associated with each child; we expect vc_chi from recording 1 and vc_chi from recording 2 to be identical (see the sketch below).

Step three: descriptive statistics. Boxplot + individual points of all the derived metrics, to check for outliers or other problems.
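A minimal sketch of the step-two check, assuming a dataframe with one row per (child, recording) and one column per derived metric (all names and values below are invented for illustration):

```python
import pandas as pd

# Invented input: the child-level metric from each of the two recordings
stats = pd.DataFrame({
    'child_id':  [1, 1, 2, 2, 3, 3],
    'recording': [1, 2, 1, 2, 1, 2],
    'vc_chi':    [120, 118, 85, 92, 200, 197],
})

# One row per child, one column per recording
paired = stats.pivot(index='child_id', columns='recording', values='vc_chi')

# Correlation between the two recordings of each child (should be close to 1)
print(paired[1].corr(paired[2]))

# Relative error rate: |rec1 - rec2| / mean(rec1, rec2) (should be close to 0)
err = (paired[1] - paired[2]).abs() / paired.mean(axis=1)
print(err.describe())
```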
For the record:

```python
import os
import subprocess

import pandas as pd

# Extract the "truth" vocalization counts with the reference R script
proc = subprocess.Popen(
    [
        'Rscript', 'tests/truth/extract_quantities.R',
        os.path.join(project.path, 'raw_annotations', raw_rttm),
        str(turntakingthresh),
        'tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh)
    ],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)
stdout, stderr = proc.communicate()

# The R script writes one "<metric>.<speaker>" label per row
truth_vc = pd.read_csv('tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh), names=['col', 'value'])
truth_vc.dropna(inplace=True)
truth_vc['metric'] = truth_vc['col'].str.split('.', expand=True)[0]
truth_vc['speaker_type'] = truth_vc['col'].str.split('.', expand=True)[1]
truth_vc['speaker_type'] = truth_vc['speaker_type'].map(am.VTC_SPEAKER_TYPE_TRANSLATION)
truth_vc.drop(columns=['col'], inplace=True)

# Reshape to one row per speaker and one column per metric
truth_vc = truth_vc.pivot(index='speaker_type', columns='metric').droplevel(0, axis=1)
truth_vc.columns.name = None
truth_vc.to_csv('tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh))
```

(`project`, `am`, `raw_rttm` and `turntakingthresh` come from the surrounding test code.)
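To make the reshaping step above concrete, here is the long-to-wide pivot on a made-up two-metric truth file (labels and values invented; in the real script the speaker labels additionally go through `am.VTC_SPEAKER_TYPE_TRANSLATION`):

```python
import pandas as pd

# Invented long format, one "<metric>.<speaker>" label per row
truth_vc = pd.DataFrame({
    'col':   ['cum_dur.CHI', 'cum_dur.FEM', 'voc_count.CHI', 'voc_count.FEM'],
    'value': [1200.0, 3400.0, 150.0, 410.0],
})
truth_vc[['metric', 'speaker_type']] = truth_vc['col'].str.split('.', expand=True)

# Wide format: one row per speaker_type, one column per metric
wide = truth_vc.pivot(index='speaker_type', columns='metric', values='value')
print(wide)
# metric        cum_dur  voc_count
# speaker_type
# CHI            1200.0      150.0
# FEM            3400.0      410.0
```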
Evaluate the stability of the procedure on pairs of recordings:

```python
import os
import sys
import multiprocessing as mp
from functools import partial

import numpy as np
import pandas as pd
import sox
from matplotlib import pyplot as plt

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

project = ChildProject(sys.argv[1])
project.read()

def get_audio_duration(filename):
    # sox fails on missing or corrupt files; fall back to a zero duration
    if not os.path.exists(filename):
        return 0
    duration = 0
    try:
        duration = sox.file_info.duration(filename)
    except Exception:
        pass
    return duration

project.recordings['duration'] = project.recordings['filename'].map(
    lambda f: get_audio_duration(os.path.join(project.path, 'recordings', f))
)

# For each (child, day), the shortest of the paired recordings bounds the comparison window
min_durations = project.recordings.groupby(['child_id', 'date_iso'])['duration'].min().reset_index().rename(columns={'duration': 'min_duration'})
recordings = project.recordings.merge(min_durations, on=['child_id', 'date_iso'])

am = AnnotationManager(project)
am.annotations = am.annotations.merge(recordings[['filename', 'min_duration']], left_on='recording_filename', right_on='filename')

# Keep only VTC annotations that imported without errors
vtc = am.annotations[am.annotations['set'] == 'vtc']
vtc = vtc[vtc['error'].isnull()]

def get_stats(am, af):
    try:
        annotation = am.annotations[am.annotations['annotation_filename'] == af]
        segments = am.get_segments(annotation)
        # Clip both recordings of a pair to the duration of the shorter one
        segments = am.clip_segments(segments, 0, segments['min_duration'].min())
        print(af, segments.shape[0])
        df = am.get_vc_stats(segments).reset_index().assign(annotation_filename=af)
    except Exception:
        df = pd.DataFrame()
    return df

if not os.path.exists('all_stats.csv'):
    pool = mp.Pool()
    all_stats = pool.map(
        partial(get_stats, am),
        vtc['annotation_filename'].tolist()
    )
    all_stats = pd.concat(all_stats)
    all_stats = all_stats.merge(vtc[['annotation_filename', 'recording_filename']], how='left', on='annotation_filename')
    all_stats = all_stats.merge(recordings[['filename', 'child_id', 'date_iso', 'min_duration']], how='left', left_on='recording_filename', right_on='filename')
    all_stats.to_csv('all_stats.csv', index=False)

all_stats = pd.read_csv('all_stats.csv')

# Exclude recordings that underwent a merge, which would bias the comparison
undergone_merge = pd.read_csv('metadata/merged.csv')
undergone_merge['recording'] = undergone_merge['filename'].str.extract(r"([0-9]+_(?:CW[0-9]+|NA)_CH[0-9]+_(?:AJ|FB|LM)[0-9]+_(?:AJ|FB|LM)[0-9]+_[0-9]{6})_") + '.WAV'
undergone_merge.dropna(inplace=True)
all_stats = all_stats[~all_stats['filename'].isin(undergone_merge['recording'].unique())]

# Relative error = std/mean of each metric across the recordings of a pair;
# keep only pairs with at least 2 recordings of at least one hour each
grouped = all_stats.groupby(['child_id', 'date_iso', 'speaker_type'])
df = grouped.agg({
    'cum_dur': ['std', 'mean', 'count'],
    'voc_count': ['std', 'mean'],
    'turns': ['std', 'mean'],
    'cds_dur': ['std', 'mean'],
    'min_duration': 'min'
})
df = df[df['cum_dur']['count'] >= 2]
df = df[df['min_duration']['min'] >= 3600]

for metric in ['cum_dur', 'voc_count', 'turns', 'cds_dur']:
    df[metric + '_err'] = df[metric]['std'] / df[metric]['mean']

metric = 'cds_dur'
df = df.reset_index()
print(df.groupby('speaker_type')[metric + '_err'].describe(percentiles=[0.5, 0.95]))
print(df.groupby('speaker_type')[metric + '_err'].agg(['mean', 'median']))

# df[['speaker_type', metric + '_err']].groupby('speaker_type').boxplot(column=[metric + '_err'])
df[['speaker_type', metric + '_err']].hist(by='speaker_type', bins=100, histtype='step')

# Scatter each metric for recording 1 against recording 2, per speaker type
scatter = all_stats[['speaker_type', 'child_id', 'date_iso', metric]].groupby(['speaker_type', 'child_id', 'date_iso'])[metric].apply(lambda x: x.tolist())
scatter = pd.DataFrame(scatter.tolist(), index=scatter.index)\
    .rename(columns=lambda x: x + 1)\
    .add_prefix('err')\
    .reset_index()
print(scatter)

fig, ax = plt.subplots(figsize=(8, 6))
u, scatter['label_num'] = np.unique(scatter['speaker_type'], return_inverse=True)
sc = ax.scatter(x='err1', y='err2', c='label_num', data=scatter, marker='+')

# Identity line: points on it mean the two recordings agree perfectly
x = np.linspace(0, 100000, 2)
ax.plot(x, x, color='black', linestyle='dashed')

ax.legend(sc.legend_elements()[0], u, title='speaker_type')
ax.set_xlim(0, max(scatter['err1'].max(), scatter['err2'].max()))
ax.set_ylim(0, max(scatter['err1'].max(), scatter['err2'].max()))
ax.set_xlabel(metric + ' (recording 1)')
ax.set_ylabel(metric + ' (recording 2)')
plt.show()
```
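The per-metric error computed above (`std/mean` within each child/day group) is a coefficient of variation across the two recordings of a pair. A standalone illustration with invented numbers:

```python
import numpy as np

# Two recordings of the same child on the same day (values invented)
rec = np.array([118.0, 124.0])

# pandas' GroupBy 'std' aggregation uses ddof=1, so match it here
cv = rec.std(ddof=1) / rec.mean()
print(cv)  # ~0.035, i.e. the two recordings disagree by about 3.5%
```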