Merge pull request oss-aspen#638 from JamesKunstle/cache-schema-updates
refactor schema names
cdolfi authored Jan 31, 2024
2 parents c07fd63 + dd2a8bc commit 0dc26d9
Showing 28 changed files with 250 additions and 213 deletions.
2 changes: 1 addition & 1 deletion 8Knot/cache_manager/cache_facade.py
@@ -219,7 +219,7 @@ def retrieve_from_cache(
             """
             SELECT *
             FROM {tablename} t
-            WHERE t.id IN %s;
+            WHERE t.repo_id IN %s;
             """.format(
                 tablename=tablename
             ),
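The one-line change above switches the cache lookup key from the ambiguous id to repo_id. For context, psycopg2 adapts a Python tuple into a parenthesized SQL list, which is how a whole collection of repo IDs binds to the single IN %s placeholder. A minimal sketch of that binding, assuming a psycopg2 connection — the function name and signature here are illustrative, not 8Knot's actual cache API:

import psycopg2  # assumed driver; 8Knot's actual import may differ


def fetch_cached_rows(conn, tablename: str, repo_ids: list[int]):
    # Illustrative only: bind a list of repo IDs to "WHERE t.repo_id IN %s".
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT *
            FROM {tablename} t
            WHERE t.repo_id IN %s;
            """.format(tablename=tablename),
            (tuple(repo_ids),),  # a tuple inside the params tuple renders as (1, 2, 3)
        )
        return cur.fetchall()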
60 changes: 30 additions & 30 deletions 8Knot/cache_manager/db_init.py
@@ -121,10 +121,10 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS commits_query(
-                id int,
-                commits text, -- this is the commit hash, so it's base64 hash.
+                repo_id int,
+                commit_hash text, -- this is the commit hash, so it's base64 hash.
                 author_email text,
-                date text,
+                author_date text,
                 author_timestamp text,
                 committer_timestamp text)
             """
@@ -134,15 +134,15 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS issues_query(
-                id int,
+                repo_id int,
                 repo_name text,
-                issue int,
+                issue_number int,
                 gh_issue int,
                 reporter_id text,
                 issue_closer text,
-                created text,
-                closed text
+                created_at text,
+                closed_at text
             )
             """
         )
@@ -151,14 +151,14 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS prs_query(
-                id int,
+                repo_id int,
                 repo_name text,
-                pull_request int,
+                pull_request_id int,
                 pr_src_number int,
                 cntrb_id text,
-                created text,
-                closed text,
-                merged text
+                created_at text,
+                closed_at text,
+                merged_at text
             )
             """
         )
@@ -168,8 +168,8 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS affiliation_query(
                 cntrb_id text,
-                created text,
-                id int,
+                created_at text,
+                repo_id int,
                 login text,
                 action text,
                 rank int,
@@ -183,7 +183,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS contributors_query(
-                id int,
+                repo_id int,
                 repo_name text,
                 cntrb_id text,
                 created_at text,
@@ -199,9 +199,9 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS issue_assignee_query(
                 issue_id text,
-                id int,
-                created text,
-                closed text,
+                repo_id int,
+                created_at text,
+                closed_at text,
                 assign_date text,
                 assignment_action text,
                 assignee text
@@ -214,9 +214,9 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS pr_assignee_query(
                 pull_request_id int,
-                id int,
-                created text,
-                closed text,
+                repo_id int,
+                created_at text,
+                closed_at text,
                 assign_date text,
                 assignment_action text,
                 assignee text
@@ -229,7 +229,7 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS cntrb_per_file_query(
                 file_path text,
-                id int,
+                repo_id int,
                 cntrb_ids text
             )
             """
@@ -240,8 +240,8 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS pr_file_query(
                 file_path text,
-                pull_request int,
-                id int
+                pull_request_id int,
+                repo_id int
             )
             """
         )
@@ -250,7 +250,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_files_query(
-                id int,
+                repo_id int,
                 repo_name text,
                 repo_path text,
                 rl_analysis_date text,
@@ -264,7 +264,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_languages_query(
-                id int,
+                repo_id int,
                 programming_language text,
                 code_lines int,
                 files int
@@ -276,7 +276,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS package_version_query(
-                id int,
+                repo_id int,
                 name text,
                 current_release_date text,
                 latest_release_date text,
@@ -289,7 +289,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_releases_query(
-                id int,
+                repo_id int,
                 release_name text,
                 release_created_at text,
                 release_published_at text,
@@ -302,7 +302,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS ossf_score_query(
-                id int,
+                repo_id int,
                 name text,
                 score float4
             )
@@ -313,7 +313,7 @@ def _create_application_tables() -> None:
         cur.execute(
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS repo_info_query(
-                id int,
+                repo_id int,
                 issues_enabled text,
                 fork_count int,
                 watchers_count int,
@@ -331,7 +331,7 @@ def _create_application_tables() -> None:
             """
             CREATE UNLOGGED TABLE IF NOT EXISTS pr_response_query(
                 pull_request_id int,
-                ID int,
+                repo_id int,
                 cntrb_id text,
                 msg_timestamp text,
                 msg_cntrb_id text,
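Every table here is created UNLOGGED: Postgres skips write-ahead logging for such tables, which makes bulk cache writes cheaper at the cost of the table being truncated after a crash — a reasonable trade-off for data that can simply be re-fetched. A hypothetical smoke test for the renames (not part of this PR; the DSN is a placeholder):

import psycopg2

conn = psycopg2.connect("dbname=cache user=postgres")  # placeholder DSN
with conn.cursor() as cur:
    cur.execute(
        """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = 'commits_query';
        """
    )
    cols = {row[0] for row in cur.fetchall()}

# after db_init runs, the renamed columns should be present
assert "repo_id" in cols and "id" not in cols
assert "commit_hash" in cols and "commits" not in cols
conn.close()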
8 changes: 4 additions & 4 deletions 8Knot/pages/affiliation/visualizations/gh_org_affiliation.py
@@ -168,16 +168,16 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
     requiring no further processing."""
 
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # intital count of same company name in github profile
     result = df.cntrb_company.value_counts(dropna=False)
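The created → created_at rename recurs in each affiliation visualization below. A standalone sketch of the sort-and-filter pattern with made-up rows — the real frame comes from the affiliation query, and the date bounds come from the page's date picker:

import pandas as pd

df = pd.DataFrame(
    {
        "cntrb_company": ["Red Hat", "Red Hat", None],
        "created_at": ["2023-01-02", "2023-06-15", "2024-01-10"],
    }
)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
df = df.sort_values(by="created_at", ascending=True)

start_date, end_date = "2023-01-01", "2023-12-31"  # stand-ins for picker values
if start_date is not None:
    df = df[df.created_at >= start_date]
if end_date is not None:
    df = df[df.created_at <= end_date]

# count of identical company names, mirroring the value_counts step above
print(df.cntrb_company.value_counts(dropna=False))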
18 changes: 12 additions & 6 deletions 8Knot/pages/affiliation/visualizations/org_associated_activity.py
@@ -82,8 +82,14 @@
     dbc.Checklist(
         id=f"email-filter-{PAGE}-{VIZ_ID}",
         options=[
-            {"label": "Exclude Gmail", "value": "gmail"},
-            {"label": "Exclude GitHub", "value": "github"},
+            {
+                "label": "Exclude Gmail",
+                "value": "gmail",
+            },
+            {
+                "label": "Exclude GitHub",
+                "value": "github",
+            },
         ],
         value=[""],
         inline=True,
@@ -201,16 +207,16 @@ def org_associated_activity_graph(repolist, num, start_date, end_date, email_fil
 
 def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # creates list of emails for each contribution and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").tolist()
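The email_list column holds one " , "-separated string per contribution, so the split/explode above yields one email per contribution. A toy sketch; how the checklist's gmail/github values are applied downstream is not shown in this diff, so the filtering step below is an assumption:

import pandas as pd

df = pd.DataFrame(
    {"email_list": ["a@gmail.com , b@redhat.com", "c@users.noreply.github.com"]}
)
emails = df["email_list"].str.split(" , ").explode().tolist()

email_filter = ["gmail"]  # value from the dbc.Checklist above (assumed usage)
if "gmail" in email_filter:
    emails = [e for e in emails if not e.endswith("@gmail.com")]
print(emails)  # ['b@redhat.com', 'c@users.noreply.github.com']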
30 changes: 21 additions & 9 deletions 8Knot/pages/affiliation/visualizations/org_core_contributors.py
@@ -107,8 +107,14 @@
     dbc.Checklist(
         id=f"email-filter-{PAGE}-{VIZ_ID}",
         options=[
-            {"label": "Exclude Gmail", "value": "gmail"},
-            {"label": "Exclude GitHub", "value": "github"},
+            {
+                "label": "Exclude Gmail",
+                "value": "gmail",
+            },
+            {
+                "label": "Exclude GitHub",
+                "value": "github",
+            },
         ],
         value=[""],
         inline=True,
@@ -165,7 +171,13 @@ def toggle_popover(n, is_open):
     background=True,
 )
 def compay_associated_activity_graph(
-    repolist, contributions, contributors, start_date, end_date, email_filter, bot_switch
+    repolist,
+    contributions,
+    contributors,
+    start_date,
+    end_date,
+    email_filter,
+    bot_switch,
 ):
     # wait for data to asynchronously download and become available.
     while not_cached := cf.get_uncached(func_name=aq.__name__, repolist=repolist):
@@ -201,23 +213,23 @@ def compay_associated_activity_graph(
 
 def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_date, email_filter):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # groups contributions by countributor id and counts, created column now hold the number
     # of contributions for its respective contributor
-    df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created"]].count()
+    df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()
 
     # filters out contributors that dont meet the core contribution threshhold
-    df = df[df.created >= contributions]
+    df = df[df.created_at >= contributions]
 
     # creates list of unique emails and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").tolist()
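Since count() lands in the renamed column, the threshold comparison also moves to created_at. A toy version of the core-contributor cutoff above:

import pandas as pd

df = pd.DataFrame(
    {
        "cntrb_id": ["u1", "u1", "u2"],
        "email_list": ["a@x.org", "a@x.org", "b@y.org"],
        "created_at": pd.to_datetime(
            ["2023-01-01", "2023-02-01", "2023-03-01"], utc=True
        ),
    }
)

# per-contributor count: created_at now holds the number of contributions
counts = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()

contributions = 2  # hypothetical threshold from the UI
core = counts[counts.created_at >= contributions]
print(core)  # only u1 meets the threshold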
8 changes: 4 additions & 4 deletions 8Knot/pages/affiliation/visualizations/unqiue_domains.py
@@ -165,16 +165,16 @@ def unique_domains_graph(repolist, num, start_date, end_date, bot_switch):
 
 def process_data(df: pd.DataFrame, num, start_date, end_date):
     # convert to datetime objects rather than strings
-    df["created"] = pd.to_datetime(df["created"], utc=True)
+    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
 
     # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created", axis=0, ascending=True)
+    df = df.sort_values(by="created_at", axis=0, ascending=True)
 
     # filter values based on date picker
     if start_date is not None:
-        df = df[df.created >= start_date]
+        df = df[df.created_at >= start_date]
     if end_date is not None:
-        df = df[df.created <= end_date]
+        df = df[df.created_at <= end_date]
 
     # creates list of unique emails and flattens list result
     emails = df.email_list.str.split(" , ").explode("email_list").unique().tolist()
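A sketch of the presumable next step — reducing the unique emails to unique domains; the downstream logic of unqiue_domains.py is not shown in this diff:

import pandas as pd

df = pd.DataFrame({"email_list": ["a@gmail.com , b@redhat.com", "b@redhat.com"]})
emails = df["email_list"].str.split(" , ").explode().unique().tolist()
domains = {e.split("@")[-1] for e in emails}
print(domains)  # {'gmail.com', 'redhat.com'}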
16 changes: 10 additions & 6 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
@@ -180,7 +180,7 @@ def directory_dropdown(repo_id):
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df["repo_name"].iloc[0]
     repo_path = df["repo_path"].iloc[0]
-    repo_id = str(df["id"].iloc[0])
+    repo_id = str(df["repo_id"].iloc[0])
 
     # pattern found in each file path, used to slice to get only the root file path
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
@@ -192,7 +192,7 @@ def directory_dropdown(repo_id):
     # drop unneccessary columns not needed after preprocessing steps
     df = df.reset_index()
     df.drop(
-        ["index", "id", "repo_name", "repo_path", "rl_analysis_date"],
+        ["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
         axis=1,
         inplace=True,
     )
@@ -308,7 +308,7 @@ def process_data(
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df_file["repo_name"].iloc[0]
     repo_path = df_file["repo_path"].iloc[0]
-    repo_id = str(df_file["id"].iloc[0])
+    repo_id = str(df_file["repo_id"].iloc[0])
 
     # pattern found in each file path, used to slice to get only the root file path
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
@@ -322,8 +322,8 @@ def process_data(
     df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))
 
     # drop unnecessary columns
-    df_file.drop(["id"], axis=1, inplace=True)
-    df_file_cntbs.drop(["id"], axis=1, inplace=True)
+    df_file.drop(["repo_id"], axis=1, inplace=True)
+    df_file_cntbs.drop(["repo_id"], axis=1, inplace=True)
 
     # Left join on df_files to only get the files that are currently in the repository
     # and the contributors that have ever opened a pr that included edits on the file
@@ -387,7 +387,11 @@ def process_data(
 
     # drop unneccessary columns not needed after preprocessing steps
     df_actions = df_actions.reset_index()
-    df_actions.drop(["index", "id", "repo_name", "login", "Action", "rank"], axis=1, inplace=True)
+    df_actions.drop(
+        ["index", "repo_id", "repo_name", "login", "Action", "rank"],
+        axis=1,
+        inplace=True,
+    )
 
     # dictionary of cntrb_ids and their most recent activity on repo
     last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()
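Both hunks above rebuild path_slice, the "<repo_id>-<repo_path>/<repo_name>/" prefix that precedes every file path in this dataset; slicing it off leaves paths relative to the repository root. A sketch with invented values — how the module applies the slice is outside this diff:

repo_id = "123"
repo_path = "github.com/oss-aspen"
repo_name = "8knot"
path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"

file_path = path_slice + "pages/index.py"
relative = file_path[len(path_slice):]  # strip the repo prefix
print(relative)  # pages/index.py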
21 more changed files not shown.
