require 'csv'
require 'pry'
require './comment'
# Generate merged motion data for the files listed in the "flist" file (and their motion labels)
# Output per-label rows to "fout_motion" and cumulative sums to "fout_motion_sums"
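# The "flist" CSV is expected to have "name" and "label" columns, e.g. (illustrative example, not actual data):
#   name,label
#   data/2017-01.csv,2017-01
#   data/2017-02.csv,2017-02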
def generate(flist, fout_motion, fout_motion_sums, fsummary)
sort_column = 'authors'
summaries = {}
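# Optional summary file: maps project name to its summary row; when present,
# its 'authors' value overrides the per-label max computed below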
if fsummary
CSV.foreach(fsummary, headers: true) do |row|
next if is_comment row
h = row.to_h
summaries[h['project']] = h
end
end
files = []
CSV.foreach(flist, headers: true) do |row|
next if is_comment row
h = row.to_h
name = h['name'].strip
label = h['label'].strip
files << [name, label]
end
# Read data from files and labels list in "flist"
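# Each data file is expected to contain at least these columns (inferred from the keys handled below):
# project, url, org, repo, activity, comments, prs, commits, issues, authors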
projects = {}
labels = {}
files.each do |file_data|
file, label = file_data
CSV.foreach(file, headers: true) do |row|
next if is_comment row
h = row.to_h
h.each do |p, v|
vi = v.to_i
vis = vi.to_s
# Convert strings that contain integers to integers
h[p] = vi if vis == v
end
project = h['project']
h['label'] = label
projects[project] = {} unless projects.key? project
projects[project][label] = h
labels[label] = true
end
end
# Labels should be alphabetical (the Google Sheet actually requires time data)
# So I suggest YYYYMM or YYYY-MM etc.; MM/YYYY sorted alphabetically will give a wrong result:
# 1/2017 < 2/2016
labels = labels.keys
summary_map = {}
summary_map[true] = 0
summary_map[false] = 0
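# summary_map counts how many projects have (true) / lack (false) a summary entry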
# Compute sums
projects.each do |project, items|
have_summary = summaries.key? project
summary_map[have_summary] += 1
sum = {}
cum_labels = []
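# Walk the labels in order, accumulating per-project sums; cum_labels tracks the labels seen so far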
labels.each do |label|
proj = items[label]
next unless proj
cum_labels << label
proj.each do |k, v|
next if ['org', 'repo'].include? k
if ['activity', 'comments', 'prs', 'commits', 'issues'].include? k
sum[k] = 0 unless sum.key? k
sum[k] += v
elsif ['project', 'url'].include? k
sum[k] = v
elsif k == 'authors'
# Column authors is not summed but max'ed
if have_summary
sum[k] = summaries[project]['authors'].to_i
else
sum[k] = v unless sum.key? k
sum[k] = [sum[k], v].max
end
elsif k == 'label'
sum[k] = [] unless sum.key? k
sum[k] << v
else
puts "Invalid key #{k}"
p proj
end
end
# This is nasty: the array [label] is used as a hash key (distinct from the string key label)
# to store a snapshot of the cumulative sums up to and including this label
items[[label]] = [cum_labels.dup, sum.dup]
end
items[:sum] = sum
end
# Sort by sort_column (sum of data from all data files)
# It determines top projects
projs_arr = []
projects.each do |project, items|
projs_arr << [project, items[:sum][sort_column], items]
end
projs_arr = projs_arr.sort_by { |item| -item[1] }
# Only put a project in the output if it has data for all labels
top_projs = []
n = 0
indices = []
projs_arr.each_with_index do |item, index|
lbls = item[2][:sum]['label']
if lbls.size == labels.size
n += 1
if n <= 30 && item[2][:sum]['url'] == ''
puts "Project ##{n} '#{item[0]}' is missing URL"
end
top_projs << item
indices << index
end
end
# To check which projects made it into the final motion chart,
# uncomment this:
# p indices[0..29]
# Motion chart data
ks = %w(project url label activity comments prs commits issues authors)
ks += %w(sum_activity sum_comments sum_prs sum_commits sum_issues sum_authors)
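# Header: per-label values first, then the all-period totals prefixed with "sum_"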
CSV.open(fout_motion, "w", headers: ks) do |csv|
csv << ks
top_projs.each do |item|
proj = item[0]
sum = item[2][:sum]
authors = 0
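# Running max of 'authors' across the labels processed so far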
labels.each do |label|
row = item[2][label]
authors = [authors, row['authors']].max
csv_row = [
proj,
row['url'],
label,
row['activity'],
row['comments'],
row['prs'],
row['commits'],
row['issues'],
authors,
sum['activity'],
sum['comments'],
sum['prs'],
sum['commits'],
sum['issues'],
sum['authors']
]
csv << csv_row
end
end
end
# Cumulative sums
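# One row per (project, label), holding values accumulated up to and including that label,
# taken from the snapshots stored under the array key [label] above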
CSV.open(fout_motion_sums, "w", headers: ks) do |csv|
csv << ks
top_projs.each do |item|
proj = item[0]
sum = item[2][:sum]
authors = 0
labels.each do |label|
# sum_labels = item[2][[label]][0]
# puts "#{proj} #{sum_labels}"
row = item[2][[label]][1]
csv_row = [
proj,
row['url'],
label,
row['activity'],
row['comments'],
row['prs'],
row['commits'],
row['issues'],
row['authors'],
sum['activity'],
sum['comments'],
sum['prs'],
sum['commits'],
sum['issues'],
sum['authors']
]
csv << csv_row
end
end
end
end
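# Example invocation (file names are illustrative):
#   ruby generate_motion.rb files.csv motion.csv motion_sums.csv summary.csv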
if ARGV.size < 3
puts "Missing arguments: files.csv motion.csv motion_sums.csv [summary.csv]"
exit(1)
end
generate(ARGV[0], ARGV[1], ARGV[2], ARGV[3])