Merge pull request oss-aspen#638 from JamesKunstle/cache-schema-updates
refactor schema names
cdolfi authored Jan 31, 2024
2 parents c07fd63 + dd2a8bc commit 0dc26d9
Showing 28 changed files with 250 additions and 213 deletions.
2 changes: 1 addition & 1 deletion 8Knot/cache_manager/cache_facade.py
@@ -219,7 +219,7 @@ def retrieve_from_cache(
             """
             SELECT *
             FROM {tablename} t
-            WHERE t.id IN %s;
+            WHERE t.repo_id IN %s;
             """.format(
                 tablename=tablename
             ),
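The one-line change above switches the cache lookup key from the ambiguous id to repo_id. For context, psycopg2 adapts a Python tuple into a parenthesized SQL list, which is how a whole collection of repo IDs binds to the single IN %s placeholder. A minimal sketch of that binding, assuming a psycopg2 connection — the function name and signature here are illustrative, not 8Knot's actual cache API:

import psycopg2  # assumed driver; 8Knot's actual import may differ


def fetch_cached_rows(conn, tablename: str, repo_ids: list[int]):
    # Illustrative only: bind a list of repo IDs to "WHERE t.repo_id IN %s".
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT *
            FROM {tablename} t
            WHERE t.repo_id IN %s;
            """.format(tablename=tablename),
            (tuple(repo_ids),),  # a tuple inside the params tuple renders as (1, 2, 3)
        )
        return cur.fetchall()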
60 changes: 30 additions & 30 deletions 8Knot/cache_manager/db_init.py
@@ -121,10 +121,10 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS commits_query(
-                id int,
-                commits text, -- this is the commit hash, so it's base64 hash.
+                repo_id int,
+                commit_hash text, -- this is the commit hash, so it's base64 hash.
                 author_email text,
-                date text,
+                author_date text,
                 author_timestamp text,
                 committer_timestamp text)
             """
@@ -134,15 +134,15 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS issues_query(
-                id int,
+                repo_id int,
                 repo_name text,
-                issue int,
+                issue_number int,
                 gh_issue int,
                 reporter_id text,
                 issue_closer text,
-                created text,
-                closed text
+                created_at text,
+                closed_at text
             )
             """
         )
@@ -151,14 +151,14 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS prs_query(
-                id int,
+                repo_id int,
                 repo_name text,
-                pull_request int,
+                pull_request_id int,
                 pr_src_number int,
                 cntrb_id text,
-                created text,
-                closed text,
-                merged text
+                created_at text,
+                closed_at text,
+                merged_at text
             )
             """
         )
@@ -168,8 +168,8 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS affiliation_query(
                 cntrb_id text,
-                created text,
-                id int,
+                created_at text,
+                repo_id int,
                 login text,
                 action text,
                 rank int,
@@ -183,7 +183,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS contributors_query(
-                id int,
+                repo_id int,
                 repo_name text,
                 cntrb_id text,
                 created_at text,
@@ -199,9 +199,9 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS issue_assignee_query(
                 issue_id text,
-                id int,
-                created text,
-                closed text,
+                repo_id int,
+                created_at text,
+                closed_at text,
                 assign_date text,
                 assignment_action text,
                 assignee text
@@ -214,9 +214,9 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS pr_assignee_query(
                 pull_request_id int,
-                id int,
-                created text,
-                closed text,
+                repo_id int,
+                created_at text,
+                closed_at text,
                 assign_date text,
                 assignment_action text,
                 assignee text
@@ -229,7 +229,7 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS cntrb_per_file_query(
                 file_path text,
-                id int,
+                repo_id int,
                 cntrb_ids text
             )
             """
@@ -240,8 +240,8 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS pr_file_query(
                 file_path text,
-                pull_request int,
-                id int
+                pull_request_id int,
+                repo_id int
             )
             """
         )
@@ -250,7 +250,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_files_query(
-                id int,
+                repo_id int,
                 repo_name text,
                 repo_path text,
                 rl_analysis_date text,
@@ -264,7 +264,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_languages_query(
-                id int,
+                repo_id int,
                 programming_language text,
                 code_lines int,
                 files int
@@ -276,7 +276,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS package_version_query(
-                id int,
+                repo_id int,
                 name text,
                 current_release_date text,
                 latest_release_date text,
@@ -289,7 +289,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_releases_query(
-                id int,
+                repo_id int,
                 release_name text,
                 release_created_at text,
                 release_published_at text,
@@ -302,7 +302,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS ossf_score_query(
-                id int,
+                repo_id int,
                 name text,
                 score float4
             )
@@ -313,7 +313,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_info_query(
-                id int,
+                repo_id int,
                 issues_enabled text,
                 fork_count int,
                 watchers_count int,
@@ -331,7 +331,7 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS pr_response_query(
                 pull_request_id int,
-                ID int,
+                repo_id int,
                 cntrb_id text,
                 msg_timestamp text,
                 msg_cntrb_id text,
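Every table here is created UNLOGGED: Postgres skips write-ahead logging for such tables, which makes bulk cache writes cheaper at the cost of the table being truncated after a crash — a reasonable trade-off for data that can simply be re-fetched. A hypothetical smoke test for the renames (not part of this PR; the DSN is a placeholder):

import psycopg2

conn = psycopg2.connect("dbname=cache user=postgres")  # placeholder DSN
with conn.cursor() as cur:
    cur.execute(
        """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = 'commits_query';
        """
    )
    cols = {row[0] for row in cur.fetchall()}

# after db_init runs, the renamed columns should be present
assert "repo_id" in cols and "id" not in cols
assert "commit_hash" in cols and "commits" not in cols
conn.close()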
8 changes: 4 additions & 4 deletions 8Knot/pages/affiliation/visualizations/gh_org_affiliation.py
@@ -168,16 +168,16 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
     requiring no further processing."""
 
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # intital count of same company name in github profile
     result = df.cntrb_company.value_counts(dropna=False)
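The created → created_at rename recurs in each affiliation visualization below. A standalone sketch of the sort-and-filter pattern with made-up rows — the real frame comes from the affiliation query, and the date bounds come from the page's date picker:

import pandas as pd

df = pd.DataFrame(
    {
        "cntrb_company": ["Red Hat", "Red Hat", None],
        "created_at": ["2023-01-02", "2023-06-15", "2024-01-10"],
    }
)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
df = df.sort_values(by="created_at", ascending=True)

start_date, end_date = "2023-01-01", "2023-12-31"  # stand-ins for picker values
if start_date is not None:
    df = df[df.created_at >= start_date]
if end_date is not None:
    df = df[df.created_at <= end_date]

# count of identical company names, mirroring the value_counts step above
print(df.cntrb_company.value_counts(dropna=False))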
18 changes: 12 additions & 6 deletions 8Knot/pages/affiliation/visualizations/org_associated_activity.py
@@ -82,8 +82,14 @@
     dbc.Checklist(
         id=f"email-filter-{PAGE}-{VIZ_ID}",
         options=[
-            {"label": "Exclude Gmail", "value": "gmail"},
-            {"label": "Exclude GitHub", "value": "github"},
+            {
+                "label": "Exclude Gmail",
+                "value": "gmail",
+            },
+            {
+                "label": "Exclude GitHub",
+                "value": "github",
+            },
         ],
         value=[""],
         inline=True,
@@ -201,16 +207,16 @@ def org_associated_activity_graph(repolist, num, start_date, end_date, email_fil
 
 def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # creates list of emails for each contribution and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").tolist()
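The email_list column holds one " , "-separated string per contribution, so the split/explode above yields one email per contribution. A toy sketch; how the checklist's gmail/github values are applied downstream is not shown in this diff, so the filtering step below is an assumption:

import pandas as pd

df = pd.DataFrame(
    {"email_list": ["a@gmail.com , b@redhat.com", "c@users.noreply.github.com"]}
)
emails = df["email_list"].str.split(" , ").explode().tolist()

email_filter = ["gmail"]  # value from the dbc.Checklist above (assumed usage)
if "gmail" in email_filter:
    emails = [e for e in emails if not e.endswith("@gmail.com")]
print(emails)  # ['b@redhat.com', 'c@users.noreply.github.com']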
30 changes: 21 additions & 9 deletions 8Knot/pages/affiliation/visualizations/org_core_contributors.py
@@ -107,8 +107,14 @@
     dbc.Checklist(
         id=f"email-filter-{PAGE}-{VIZ_ID}",
         options=[
-            {"label": "Exclude Gmail", "value": "gmail"},
-            {"label": "Exclude GitHub", "value": "github"},
+            {
+                "label": "Exclude Gmail",
+                "value": "gmail",
+            },
+            {
+                "label": "Exclude GitHub",
+                "value": "github",
+            },
         ],
         value=[""],
         inline=True,
@@ -165,7 +171,13 @@ def toggle_popover(n, is_open):
     background=True,
 )
 def compay_associated_activity_graph(
-    repolist, contributions, contributors, start_date, end_date, email_filter, bot_switch
+    repolist,
+    contributions,
+    contributors,
+    start_date,
+    end_date,
+    email_filter,
+    bot_switch,
 ):
     # wait for data to asynchronously download and become available.
     while not_cached := cf.get_uncached(func_name=aq.__name__, repolist=repolist):
@@ -201,23 +213,23 @@ def compay_associated_activity_graph(
 
 def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_date, email_filter):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # groups contributions by countributor id and counts, created column now hold the number
     # of contributions for its respective contributor
-    df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created"]].count()
+    df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()
 
     # filters out contributors that dont meet the core contribution threshhold
-    df = df[df.created >= contributions]
+    df = df[df.created_at >= contributions]
 
     # creates list of unique emails and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").tolist()
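Since count() lands in the renamed column, the threshold comparison also moves to created_at. A toy version of the core-contributor cutoff above:

import pandas as pd

df = pd.DataFrame(
    {
        "cntrb_id": ["u1", "u1", "u2"],
        "email_list": ["a@x.org", "a@x.org", "b@y.org"],
        "created_at": pd.to_datetime(
            ["2023-01-01", "2023-02-01", "2023-03-01"], utc=True
        ),
    }
)

# per-contributor count: created_at now holds the number of contributions
counts = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()

contributions = 2  # hypothetical threshold from the UI
core = counts[counts.created_at >= contributions]
print(core)  # only u1 meets the threshold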
8 changes: 4 additions & 4 deletions 8Knot/pages/affiliation/visualizations/unqiue_domains.py
@@ -165,16 +165,16 @@ def unique_domains_graph(repolist, num, start_date, end_date, bot_switch):
 
 def process_data(df: pd.DataFrame, num, start_date, end_date):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # creates list of unique emails and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").unique().tolist()
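A sketch of the presumable next step — reducing the unique emails to unique domains; the downstream logic of unqiue_domains.py is not shown in this diff:

import pandas as pd

df = pd.DataFrame({"email_list": ["a@gmail.com , b@redhat.com", "b@redhat.com"]})
emails = df["email_list"].str.split(" , ").explode().unique().tolist()
domains = {e.split("@")[-1] for e in emails}
print(domains)  # {'gmail.com', 'redhat.com'}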
16 changes: 10 additions & 6 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
@@ -180,7 +180,7 @@ def directory_dropdown(repo_id):
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df["repo_name"].iloc[0]
     repo_path = df["repo_path"].iloc[0]
-    repo_id = str(df["id"].iloc[0])
+    repo_id = str(df["repo_id"].iloc[0])
 
     # pattern found in each file path, used to slice to get only the root file path
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
@@ -192,7 +192,7 @@ def directory_dropdown(repo_id):
     # drop unneccessary columns not needed after preprocessing steps
     df = df.reset_index()
     df.drop(
-        ["index", "id", "repo_name", "repo_path", "rl_analysis_date"],
+        ["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
         axis=1,
         inplace=True,
     )
@@ -308,7 +308,7 @@ def process_data(
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df_file["repo_name"].iloc[0]
     repo_path = df_file["repo_path"].iloc[0]
-    repo_id = str(df_file["id"].iloc[0])
+    repo_id = str(df_file["repo_id"].iloc[0])
 
     # pattern found in each file path, used to slice to get only the root file path
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
@@ -322,8 +322,8 @@ def process_data(
     df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))
 
     # drop unnecessary columns
-    df_file.drop(["id"], axis=1, inplace=True)
-    df_file_cntbs.drop(["id"], axis=1, inplace=True)
+    df_file.drop(["repo_id"], axis=1, inplace=True)
+    df_file_cntbs.drop(["repo_id"], axis=1, inplace=True)
 
     # Left join on df_files to only get the files that are currently in the repository
     # and the contributors that have ever opened a pr that included edits on the file
@@ -387,7 +387,11 @@ def process_data(
 
     # drop unneccessary columns not needed after preprocessing steps
     df_actions = df_actions.reset_index()
-    df_actions.drop(["index", "id", "repo_name", "login", "Action", "rank"], axis=1, inplace=True)
+    df_actions.drop(
+        ["index", "repo_id", "repo_name", "login", "Action", "rank"],
+        axis=1,
+        inplace=True,
+    )
 
     # dictionary of cntrb_ids and their most recent activity on repo
     last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()
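Both hunks above rebuild path_slice, the "<repo_id>-<repo_path>/<repo_name>/" prefix that precedes every file path in this dataset; slicing it off leaves paths relative to the repository root. A sketch with invented values — how the module applies the slice is outside this diff:

repo_id = "123"
repo_path = "github.com/oss-aspen"
repo_name = "8knot"
path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"

file_path = path_slice + "pages/index.py"
relative = file_path[len(path_slice):]  # strip the repo prefix
print(relative)  # pages/index.py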
21 more changed files not shown.
