harvard-edge · profvjreddi · May 18, 2024 · May 18, 2024
diff --git a/contents/data_engineering/data_engineering.bib b/contents/data_engineering/data_engineering.bib
@@ -79,6 +79,19 @@ @article{gebru2021datasheets
  month = nov,
 }
 
+@inproceedings{Data_Cascades_2021,
+ author = {Sambasivan, Nithya and Kapania, Shivani and Highfill, Hannah and Akrong, Diana and Paritosh, Praveen and Aroyo, Lora M.},
+ title = {{{\textquotedblleft}Everyone} wants to do the model work, not the data work{\textquotedblright}: {Data} Cascades in High-Stakes {AI}},
+ booktitle = {Proceedings of the 2021 {CHI} Conference on Human Factors in Computing Systems},
+ pages = {1--15},
+ year = {2021},
+ doi = {10.1145/3411764.3445518},
+ source = {Crossref},
+ url = {https://doi.org/10.1145/3411764.3445518},
+ publisher = {ACM},
+ month = may,
+}
+
 @misc{googleinformation,
  author = {Google},
  bdsk-url-1 = {https://blog.google/documents/83/},

diff --git a/contents/data_engineering/data_engineering.qmd b/contents/data_engineering/data_engineering.qmd
@@ -46,7 +46,7 @@ We begin by discussing data collection: Where do we source data, and how do we g
 
 ## Problem Definition
 
-In many machine learning domains, sophisticated algorithms take center stage, while the fundamental importance of data quality is often overlooked. This neglect gives rise to ["Data Cascades"](https://research.google/pubs/pub49953/) (see @fig-cascades)—events where lapses in data quality compound, leading to negative downstream consequences such as flawed predictions, project terminations, and even potential harm to communities. In @fig-cascades, we have an illustration of potential data pitfalls at every stage and how they influence the entire process down the line. The influence of data collection errors is especially pronounced. Any lapses in this stage will become apparent at later stages (in model evaluation and deployment) and might lead to costly consequences, such as abandoning the entire model and restarting anew. Therefore, investing in data engineering techniques from the onset will help us detect errors early.
+In many machine learning domains, sophisticated algorithms take center stage, while the fundamental importance of data quality is often overlooked. This neglect gives rise to ["Data Cascades"](https://research.google/pubs/pub49953/) by @Data_Cascades_2021 (see @fig-cascades)—events where lapses in data quality compound, leading to negative downstream consequences such as flawed predictions, project terminations, and even potential harm to communities. In @fig-cascades, we have an illustration of potential data pitfalls at every stage and how they influence the entire process down the line. The influence of data collection errors is especially pronounced. Any lapses in this stage will become apparent at later stages (in model evaluation and deployment) and might lead to costly consequences, such as abandoning the entire model and restarting anew. Therefore, investing in data engineering techniques from the onset will help us detect errors early.
 
 ![Data cascades: compounded costs. Credit: @Data_Cascades_2021.](images/png/data_engineering_cascades.png){#fig-cascades}