|
31 | 31 |
|
# Assemble the validation split into a DataFrame: one row per example,
# built from the parallel lists parsed out of the jsonl file above.
# NOTE(review): `id` shadows the builtin — presumably a list built earlier
# in this file; renaming would have to happen at its definition site.
df_valid = pd.DataFrame(
    list(zip(context_id, id, context, response, response_label, type_label)),
    columns=["context_id", "id", "context", "response", "response_label", "type_label"],
)
|
36 | 35 |
|
37 | 36 | with open("./input_raw/test_split.jsonl") as f:
|
|
48 | 47 |
|
# Assemble the test split into a DataFrame with the same schema as the
# validation split, from the parallel lists filled in by the loop above.
df_test = pd.DataFrame(
    list(zip(context_id, id, context, response, response_label, type_label)),
    columns=["context_id", "id", "context", "response", "response_label", "type_label"],
)
|
53 | 51 |
|
# Merge both splits into one frame, then keep only the rows whose
# response is labelled as supported by the context.
df = pd.concat([df_valid, df_test])
df = df[df["response_label"] == "SUPPORTS"]

# The retrieval query is the dialogue context followed by the response,
# joined with a single space.
df["query"] = df["context"].astype(str) + " " + df["response"].astype(str)

# Stringify every column, drop rows with duplicate queries, and renumber
# the index so downstream chunking sees a clean 0..n-1 range.
df = df.astype(str)
df = df.drop_duplicates(subset=["query"]).reset_index(drop=True)

print(df["query"])

# Persist the processed frame for the downstream retrieval pipeline.
df.to_pickle(f"./input_processed/{DATASET}.pkl")
|
69 | 58 |
|
70 |
| - |
71 |
| -# num_chunks = ceil(df.shape[0] / CHUNK) |
72 |
| -# df_list = np.array_split(df, num_chunks) |
73 |
| - |
74 |
| -# total = 0 |
75 |
| -# for i in range(len(df_list)): |
76 |
| -# total = total + len(df_list[i]) |
77 |
| -# df_list[i].to_pickle(f"./preprocess/{i}.pkl") |
78 |
| - |
79 |
| -# print(f"Total: {total}") |
0 commit comments