adaptors.py
import re
import json

import pandas as pd


# internal helper: parse a subtitle time line and return the 'start' (index 0)
# or 'end' (index 1) time in seconds
def _times(a_string, index):
    # an empty or None string yields None
    if not a_string:
        return None
    # split the time line on commas, spaces and colons
    splits = re.split(r'[, :]', a_string)
    # cast each piece to int, skipping the '-->' separator at position 4
    splits = [int(a_split) for i, a_split in enumerate(splits) if i != 4]
    # combine hours, minutes, seconds and milliseconds into seconds
    if index == 0:
        t = splits[0] * 3600 + splits[1] * 60 + splits[2] + splits[3] / 1000
    elif index == 1:
        t = splits[4] * 3600 + splits[5] * 60 + splits[6] + splits[7] / 1000
    else:
        t = -1
    return t
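

# A worked example of the parsing above (an assumption: the time line follows
# the SRT-style "HH:MM:SS,mmm --> HH:MM:SS,mmm" pattern, which this file never
# states explicitly):
#   _times("00:01:02,500 --> 00:01:05,250", 0)  ->  62.5   (start, in seconds)
#   _times("00:01:02,500 --> 00:01:05,250", 1)  ->  65.25  (end, in seconds)
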
# transform a YouTube transcript file into a DataFrame
def youtube2df(filepath):
    # read the file
    with open(filepath) as f:
        file = f.read()
    # split the file into lines
    lines = file.splitlines()
    # each caption spans four lines (index, time line, transcript, blank separator);
    # build tuples of (orig_index, start, end, transcript), skipping the blank line
    # and any block whose time line is missing or empty (None start)
    tps = [(lines[i], _times(lines[i + 1], 0), _times(lines[i + 1], 1), lines[i + 2])
           for i in range(0, len(lines), 4)
           if i + 2 < len(lines) and _times(lines[i + 1], 0) is not None]
    # convert to a DataFrame
    _df = pd.DataFrame(tps, columns=['orig_index', 'start', 'end', 'transcript'])
    return _df
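

# Expected caption-file layout, inferred from the four-line stride above
# (an assumption, not documented in this repository):
#   1
#   00:00:01,000 --> 00:00:04,000
#   Hello and welcome.
#   <blank line>
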
# transform an AWS Transcribe JSON output file into a DataFrame
def aws2df(filepath):
    # read the JSON file
    with open(filepath) as f:
        data = json.load(f)
    # collect (start, end, transcript) tuples for items that carry a start time
    tuples = []
    for value in data['results']['items']:
        if value.get('start_time'):
            tuples.append((value.get('start_time'),
                           value.get('end_time'),
                           value['alternatives'][0].get('content')))
    # transform to a DataFrame
    _df = pd.DataFrame(tuples, columns=['start', 'end', 'transcript'])
    return _df
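

# A minimal usage sketch; 'captions.srt' and 'transcript.json' are hypothetical
# file paths used only for illustration.
if __name__ == "__main__":
    yt_df = youtube2df("captions.srt")      # YouTube caption file
    aws_df = aws2df("transcript.json")      # AWS Transcribe output JSON
    print(yt_df.head())
    print(aws_df.head())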