Draft flow PR #127

Draft · wants to merge 8 commits into master from Draft
Changes from 3 commits
263 changes: 263 additions & 0 deletions veniq/dataset_collection/dataflow/collect_task.py
@@ -0,0 +1,263 @@
import hashlib
import os
import re
from argparse import ArgumentParser
from pathlib import Path

import javalang
from javalang.parse import parse
import d6tflow
import pandas as pd

import d6tcollect

# from veniq.dataset_collection.augmentation import InvocationType
from pebble import ProcessPool
from tqdm import tqdm

from veniq.ast_framework import AST, ASTNodeType
from veniq.utils.ast_builder import build_ast
from veniq.utils.encoding_detector import read_text_with_autodetected_encoding

# disable d6tcollect's anonymous usage reporting
d6tcollect.submit = False

# columns = [
# 'project_id',
# 'original_filename',
# 'class_name',
# 'invocation_line_string',
# 'invocation_line_number_in_original_file',
# 'target_method',
# 'target_method_start_line',
# 'extract_method',
# 'extract_method_start_line',
# 'extract_method_end_line',
# 'output_filename',
# 'is_valid_ast',
# 'insertion_start',
# 'insertion_end',
# 'ncss_target',
# 'ncss_extracted',
# 'do_nothing',
# 'ONE_LINE_FUNCTION',
# 'NO_IGNORED_CASES'
# ] + [x for x in InvocationType.list_types()]


class TaskAggregatorJavaFiles(d6tflow.tasks.TaskCache):
    file = d6tflow.Parameter()
    # save() below emits one named output, so it must be listed in persist
    persist = ['results']

    def run(self):
        results = []
        # luigi-style dynamic dependency: yield returns the output target
        # of the completed TaskPreprocessJavaFile
        preprocessed = yield TaskPreprocessJavaFile(file=str(self.file))
        results.append(preprocessed)
        self.save({'results': results})


class TaskPreprocessJavaFile(d6tflow.tasks.TaskCache):
    file = d6tflow.Parameter()
    # run() saves two named outputs, so both must be listed in persist
    persist = ['text', 'ast']

    def _remove_comments(self, string: str):
        pattern = r"(\".*?\"|\'.*?\')|(/\*.*?\*/|//[^\r\n]*$)"
        # first group captures quoted strings (double or single)
        # second group captures comments (// single-line or /* multi-line */)
        regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

        def _replacer(match):
            # if the 2nd group (capturing comments) is not None,
            # we have captured a real (non-quoted) comment
            if match.group(2) is not None:
                # return empty string to strip the comment
                return ""
            else:
                # otherwise return the captured quoted string unchanged
                return match.group(1)

        return regex.sub(_replacer, string)
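    # Quick illustration of the regex above on a hypothetical snippet:
    #   before: int x = 1; // counter
    #           String s = "// not a comment";
    #   after:  int x = 1;
    #           String s = "// not a comment";
    # The quoted "// not a comment" survives because group 1 matches
    # string literals before group 2 can treat them as a comment.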

    # def save_text_to_new_file(self, input_dir: Path, text: str, filename: Path):
    #     # hash the parent path to avoid collisions between same-named files
    #     hash_path = hashlib.sha256(str(filename.parent).encode('utf-8')).hexdigest()
    #     dst_filename = input_dir / f'{filename.stem}_{hash_path}.java'
    #     if not dst_filename.parent.exists():
    #         dst_filename.parent.mkdir(parents=True)
    #     if not dst_filename.exists():
    #         with open(dst_filename, 'w', encoding='utf-8') as w:
    #             w.write(text)
    #     return dst_filename, text

    def run(self):
        original_text = read_text_with_autodetected_encoding(str(self.file))
        # strip comments, then drop empty lines and trailing whitespace
        text_without_comments = self._remove_comments(original_text)
        text = "\n".join(
            [ll.rstrip() for ll in text_without_comments.splitlines() if ll.strip()])
        try:
            ast = AST.build_from_javalang(parse(text))
            self.save({'text': text, 'ast': ast})
        except javalang.parser.JavaSyntaxError:
            # skip files that fail to parse after preprocessing; note the
            # task then completes without saving any output
            pass
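    # Loading both outputs downstream (a sketch; assumes the task has run
    # and that this d6tflow version supports outputLoad(as_dict=True)):
    #   task = TaskPreprocessJavaFile(file='Example.java')
    #   outputs = task.outputLoad(as_dict=True)
    #   text, ast = outputs['text'], outputs['ast']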


# @d6tflow.requires({'input1': TaskPreprocessJavaFile})
# class TaskFindEM(d6tflow.tasks.TaskJson):
#     def __init__(self, *args, **kwargs):
#         # keep only kwargs that correspond to declared d6tflow parameters
#         kwargs_ = {k: v for k, v in kwargs.items() if k in self.get_param_names()}
#         super().__init__(*args, **kwargs_)
#
#     def run(self):
#         input1 = self.inputLoad()['input1']
#         class_name = [x for x in input1['ast_tree'].get_proxy_nodes(
#             ASTNodeType.CLASS_DECLARATION)][0]
#         self.save({"class_name": class_name})


def get_files(dir_to_search, dir_to_save, system_cores_qty):
    test_files = set(Path(dir_to_search).glob('**/*Test*.java'))
    not_test_files = set(Path(dir_to_search).glob('**/*.java'))
    files_without_tests = list(not_test_files.difference(test_files))
    if not files_without_tests:
        raise Exception("No Java files were found")

    full_dataset_folder = Path(dir_to_save) / 'full_dataset'
    output_dir = full_dataset_folder / 'output_files'  # not used yet in this draft
    input_dir = full_dataset_folder / 'input_files'  # not used yet in this draft
    results = []
    with ProcessPool(system_cores_qty) as executor:
        tasks = [TaskAggregatorJavaFiles(file=str(x)) for x in files_without_tests]
        future = executor.map(d6tflow.run, tasks, timeout=200)
        result = future.result()
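        # pebble's map() returns a future whose result() is a generator:
        # each next() call below either yields one task's return value or
        # re-raises the worker's exception (a TimeoutError once the 200 s
        # limit is exceeded).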

        for filename in tqdm(files_without_tests):
            try:
                next(result)
                data = TaskAggregatorJavaFiles(file=str(filename)).outputLoad()
                if data:
                    results.append(data)
            except Exception as e:
                print(e)
    print(results)


if __name__ == '__main__':
    system_cores_qty = os.cpu_count() or 1
    parser = ArgumentParser()
    parser.add_argument(
        "-d", "--dir",
        required=True,
        help="Path to the Java source code to run method augmentation on"
    )
    parser.add_argument(
        "-o", "--output",
        help="Path for the file with output results",
        default='augmented_data'
    )
    parser.add_argument(
        "-j", "--jobs",
        type=int,
        default=system_cores_qty - 1,
        help="Number of processes to spawn. "
             "Defaults to one less than the number of cores; "
             "raising it higher may make the machine unresponsive "
             "while the dataset is being created.",
    )
    parser.add_argument(
        "-z", "--zip",
        action='store_true',
        help="Zip input and output files."
    )
    parser.add_argument(
        "-s", "--small_dataset_size",
        help="Number of files in the small dataset",
        default=100,
        type=int,
    )

    args = parser.parse_args()
    get_files(dir_to_search=args.dir, dir_to_save=args.output, system_cores_qty=args.jobs)
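    # Example invocation (paths are placeholders):
    #   python veniq/dataset_collection/dataflow/collect_task.py \
    #       -d /path/to/java/project -o augmented_data -j 4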