bt benchmark: Use 0 as ymin for raw value plot
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark types for which results are fetched and plotted.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket name passed to the minio client when fetching or removing objects.
DEFAULT_BUCKET = "lava"
40
41
def graph_get_color(branch):
    """
    Return the plot color associated with the given branch name.

    Raises KeyError when the branch has no color assigned.
    """
    branch_colors = {
        "stable-1.5": "red",
        "stable-2.0": "green",
        "master": "blue",
    }
    return branch_colors[branch]
48
49
def graph_get_title(branch, benchmark_type):
    """
    Build the graph title from the branch name and the benchmark type.

    Raises KeyError when the benchmark type is unknown.
    """
    output_names = {"dummy": "Dummy output", "text": "Text output"}
    output_name = output_names[benchmark_type]
    return "{} - {}".format(branch, output_name)
56
57
def get_client():
    """
    Build and return the minio client used to reach the object store.
    """
    # NOTE(review): the endpoint and credentials are hard-coded in the source.
    endpoint = "obj.internal.efficios.com"
    credentials = {"access_key": "jenkins", "secret_key": "echo123456"}
    return Minio(endpoint, **credentials)
65
66
def get_file(client, prefix, file_name, workdir_name):
    """
    Download "<prefix>/<file_name>" from the default bucket into workdir_name.

    Return the local path of the downloaded file, or None when the object
    does not exist on the remote.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    local_path = os.path.join(workdir_name, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        local_path = None
    return local_path
80
81
def delete_file(client, prefix, file_name):
    """
    Remove "<prefix>/<file_name>" from the default bucket.

    A response error is printed; a missing object is silently ignored.
    """
    try:
        client.remove_object(DEFAULT_BUCKET, "{}/{}".format(prefix, file_name))
    except ResponseError as error:
        print(error)
    except NoSuchKey:
        # Nothing to delete on the remote.
        return
93
94
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of commit hashes for the range
    "<cutoff>..origin/<bt_version>".

    An empty list is returned when the range contains no commit.

    WARNING: This runs `git fetch` in the repository located at repo_path.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # An empty log must yield an empty list: "".split("\n") would return [""],
    # i.e. one bogus empty commit id that callers would then try to process.
    return [commit for commit in log_output.splitlines() if commit]
105
106
def parse_result(result_path):
    """
    Load the JSON result file and return, for each sample, the sum of its
    user time and system time.
    """
    with open(result_path) as result_file:
        content = json.load(result_file)

    user_times = content["User time (seconds)"]
    system_times = content["System time (seconds)"]
    # Pair the samples positionally; extra samples of the longer list are
    # dropped.
    return [user + system for user, system in zip(user_times, system_times)]
120
121
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit for every benchmark type.

    Return a (results, valid) tuple. results maps each benchmark type to its
    parsed dataset, or is None as soon as one result file is missing. valid
    is False when at least one parsed dataset holds no non-zero value.
    """
    all_valid = True
    datasets = {}
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
        local_path = get_file(client, prefix, commit, workdir)
        if not local_path:
            # Benchmark is either corrupted or not complete.
            return None, all_valid

        dataset = parse_result(local_path)
        datasets[b_type] = dataset
        if not any(value != 0.0 for value in dataset):
            all_valid = False
            print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))

    # Every benchmark type had a result file.
    return datasets, all_valid
142
143
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    branch: name of the branch being plotted.
    benchmark_type: benchmark type, used for the graph title.
    x_data: x position of each commit.
    y_data: one dataset (list of values) per x position.
    labels: x tick label for each x position.
    latest_values: mapping of branch name to its latest value (or None); the
        value of every other branch is drawn as a horizontal reference line.

    Outliers reported by sanitize_dataset() are drawn with a distinct marker.
    The y axis always starts at 0.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, dataset in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(dataset)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # Leave 20% of headroom above the highest value, outliers included.
    ymax = 1
    all_values = [item for sublist in y_data for item in sublist]
    if all_values:
        ymax = 1.2 * max(all_values)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Make sure the reference line stays inside the visible range.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()

    plt.tight_layout()
191
192
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    branch: name of the branch being plotted.
    benchmark_type: benchmark type, used for the graph title.
    x_data: x position of each commit.
    y_data: one value per x position; the first one is the reference.
    labels: x tick label for each x position.
    latest_values: mapping of branch name to its latest value (or None); the
        value of every other branch is drawn as a horizontal reference line.

    The y axis is symmetric around 0% and spans at least -100% to 100%.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    # Only grow the axis beyond the default +/-100% when a point needs it;
    # the 1.3 factor leaves some margin around the farthest point.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        if local_abs_max > y_abs_max:
            y_abs_max = local_abs_max

    plt.ylim(ymin=-y_abs_max, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()

    plt.tight_layout()
248
249
def _fetch_branch_results(client, branches, git_path):
    """
    Return a dict mapping each branch to its list of (commit, results) tuples,
    keeping only commits with complete and valid benchmark results.
    """
    branch_results = {}
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results
    return branch_results


def generate_graph(branches, report_name, git_path):
    """
    Generate the pdf report.

    branches: mapping of branch name to cutoff commit.
    report_name: path of the pdf file to write.
    git_path: location of the git repository used to list the commits.

    For each benchmark type and each branch, two pages are saved: the raw
    value plot followed by the ratio plot.
    """
    # The PDF document. The context manager closes it even if fetching or
    # plotting fails.
    with PdfPages(report_name) as pdf_pages:
        client = get_client()

        # Fetch the results for each branch.
        branch_results = _fetch_branch_results(client, branches, git_path)

        for b_type in BENCHMARK_TYPES:
            latest_values = {}
            max_len = 0

            # Find the maximum size for a series inside our series dataset.
            # This is used later to compute the size of the actual plot (pdf).
            # While there gather the comparison value used to draw comparison
            # line between branches.
            for branch, results in branch_results.items():
                max_len = max(max_len, len(results))
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            # Every page of a benchmark type shares the same width.
            width = 0.16 * max_len if max_len > 10 else 11.69

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                y_data = [c[1][b_type] for c in results]
                labels = [c[0][:8] for c in results]

                fig = plt.figure(figsize=(width, 8.27), dpi=100)
                plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
                pdf_pages.savefig(fig)
                # Release the figure once saved so they do not pile up in
                # memory.
                plt.close(fig)

                fig = plt.figure(figsize=(width, 8.27), dpi=100)
                # Use the mean of each sanitize dataset here, we do not care
                # for variance for ratio. At least not yet.
                y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
                plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
                pdf_pages.savefig(fig)
                plt.close(fig)
310
311
def launch_jobs(branches, git_path, wait_for_completion, debug):
    """
    Launch jobs for all missing results.

    A job is submitted for every commit of every branch for which no
    benchmark results could be fetched.
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                existing, _ = get_benchmark_results(client, commit, workdir)
                if not existing:
                    lava_submit.submit(
                        commit, wait_for_completion=wait_for_completion, debug=debug
                    )
328
329
def main():
    """
    Parse arguments and execute as needed.

    Return 0 on success, 1 when the repository path does not exist.
    """
    # Branch name -> cutoff commit; only commits after the cutoff are
    # considered.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. By default each "
        "job is waited on, which is useful for the ci. Otherwise we could end "
        "up spamming the lava instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )

    args = parser.parse_args()

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        # The command line flag is negative, launch_jobs() expects the
        # positive form.
        launch_jobs(
            bt_branches, args.repo_path, not args.do_not_wait_on_completion, args.debug
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
387
388
def sanitize_dataset(dataset):
    """
    Use IQR 1.5 [1] to remove outlier from the dataset. This is useful to get a
    representative mean without outlier in it.

    Return a (valid_points, outliers) tuple of lists, each keeping the order
    of the original dataset. An empty dataset yields two empty lists.

    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    # Quartiles are undefined for an empty dataset.
    if len(dataset) == 0:
        return [], []

    q1, q3 = numpy.percentile(dataset, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    new_dataset = []
    outliers = []
    for value in dataset:
        if lower_bound <= value <= upper_bound:
            new_dataset.append(value)
        else:
            outliers.append(value)
    return new_dataset, outliers
408
409
# Run main() only when executed as a script and use its return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
This page took 0.041294 seconds and 5 git commands to generate.