filter reviews without features

nlp4se · Dec 3, 2024 · 4a662ea · 4a662ea
1 parent 7497257
commit 4a662ea
Show file tree

Hide file tree

Showing 7 changed files with 2 additions and 49,168 deletions.
diff --git a/data/Stage 1 - Feature extraction/output/com.discord.csv b/data/Stage 1 - Feature extraction/output/com.discord.csv
diff --git a/data/Stage 1 - Feature extraction/output/com.google.android.apps.bard.csv b/data/Stage 1 - Feature extraction/output/com.google.android.apps.bard.csv
diff --git a/data/Stage 1 - Feature extraction/output/com.microsoft.copilot.csv b/data/Stage 1 - Feature extraction/output/com.microsoft.copilot.csv
diff --git a/data/Stage 1 - Feature extraction/output/com.openai.chatgpt.csv b/data/Stage 1 - Feature extraction/output/com.openai.chatgpt.csv
diff --git a/data/Stage 1 - Feature extraction/output/com.whatsapp.csv b/data/Stage 1 - Feature extraction/output/com.whatsapp.csv
diff --git a/data/Stage 1 - Feature extraction/output/org.telegram.messenger.csv b/data/Stage 1 - Feature extraction/output/org.telegram.messenger.csv
diff --git a/scripts/Stage 1 - Feature extraction/review_postprocessing.py b/scripts/Stage 1 - Feature extraction/review_postprocessing.py
@@ -45,7 +45,8 @@ def parse_and_add_column(csv_folder, json_folder, output_folder):
 
         # Add the 'extracted_features_TransFeatEx' column
         df['extracted_features_TransFeatEx'] = df['reviewId'].map(review_features_map)
-
+        # Filter out rows where extracted_features_TransFeatEx is empty
+        df = df[df['extracted_features_TransFeatEx'].notna() & (df['extracted_features_TransFeatEx'] != '')]
         # Save the updated DataFrame to the output folder
         output_path = os.path.join(output_folder, csv_file)
         df.to_csv(output_path, sep=',', index=False)