-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataEngineering_Lab01.sql
139 lines (111 loc) · 3.82 KB
/
DataEngineering_Lab01.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
SELECT * FROM public.player_seasons;
--
CREATE TYPE season_stats as(
season Integer,
pts REAL,
ast REAL,
reb REAL,
weight INTEGER
)
CREATE TYPE scoring_class AS ENUM ('star', 'good', 'average', 'bad')
-- drop type season_stats
CREATE TABLE players(
player_name TEXT,
height TEXT,
college TEXT,
country TEXT,
draft_year TEXT,
draft_number TEXT,
season_stats season_stats[],
scoring_class scoring_class,
years_since_last_Season INTEGER,
current_season Integer,
PRIMARY KEY(player_name, current_season)
);
-- drop table players;
select * from players
-- Increamentally building the cumulative table for the players and along the way hoding on to the historical data. This happens
-- to be the most powerful and useful modeling techniques which would helps easily flatten a table and unnest according to our requirements.
/*
Why Cumulative Table Design?
Cost Optimization: In the cloud, unnecessary I/O and compute costs pile up quickly.
CTD helps us reduce these by storing pre-aggregated, cumulative snapshots of data.
Consistency Across Projects: By embedding this design into our internal DE/DS tool (CLUPA), we ensure consistent implementation across
teams, reduce the risk of data leakage, and simplify adoption.
*/
INSERT INTO players
WITH yesterday as (
select * from players
where current_season = 2000
),
today as (
select * from player_seasons
where season = 2001
)
-- Seed query as in
select
coalesce(t.player_name, y.player_name) as player_name,
coalesce(t.height, y.height) as height,
coalesce(t.college, y.college) as college,
coalesce(t.country, y.country) as country,
coalesce(t.draft_year, y.draft_year) as draft_year,
coalesce(t.draft_number, y.draft_number) as draft_number,
case when y.season_stats is null
then array[row(
t.season,
t.pts,
t.ast,
t.reb,
t.weight
)::season_stats] -- casting this to the pre-defined datatype that is struct converting the whole row into array of elements.
when t.season is not null
then y.season_stats || array[row(
t.season,
t.pts,
t.ast,
t.reb,
t.weight
)::season_stats]
else y.season_stats
END as season_stats,
case
when t.season is not null then
case when t.pts > 20 then 'star'
when t.pts > 15 then 'good'
when t.pts > 10 then 'average'
else 'bad'
end::scoring_class
else y.scoring_class
end as scoring_class,
case when t.season is not null then 0
else y.years_since_last_season + 1
end as years_since_last_season,
coalesce (t.season, y.current_season + 1) as current_season
from today t FULL OUTER JOIN yesterday y on t.player_name = y.player_name;
-- Unnest the struct elements that is to seperate out the elements of an array into columns.
with unnested as(
select player_name,
unnest(season_stats)::season_stats as season_stats
from players
where current_season = 2001
and player_name = 'Michael Jordan')
select
player_name, (season_stats::season_stats).* from unnested
/* This is query doesn't have a group by but it seems like we would normally have a group by. You see we would have used min() and max()
and use group by to for non-aggregate elements in the query which would eventually slow dow the query execution time.
The query below is insanely fast with the help of cumulative table design approach, with no group bys and shuffuling around.
*/
select
player_name,
(season_stats[CARDINALITY(season_stats)]::season_stats).pts /
case
when (season_stats[1]::season_stats).pts = 0
then 1
else (season_stats[1]::season_stats).pts
end as improvement
from players
where current_season = 2001
and scoring_class = 'star'
order by 2 desc
select * from players
where player_name = 'Don MacLean'