Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: add hudi core API docs with examples #113

Merged
merged 3 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions crates/core/src/config/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
//! Hudi internal configurations.

use std::collections::HashMap;
use std::str::FromStr;
Expand All @@ -25,6 +26,21 @@ use strum_macros::EnumIter;

use crate::config::{ConfigParser, HudiConfigValue};

/// Configurations for internal use.
///
/// **Example**
///
/// ```rust
/// use url::Url;
/// use hudi_core::config::HudiConfigValue;
/// use hudi_core::config::internal::HudiInternalConfig::SkipConfigValidation;
/// use hudi_core::table::Table as HudiTable;
///
/// let options = vec![(SkipConfigValidation.as_ref(), HudiConfigValue::Boolean(true))];
/// let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
/// HudiTable::new_with_options(base_uri.as_ref(), options);
/// ```
///
#[derive(Clone, Debug, PartialEq, Eq, Hash, EnumIter)]
pub enum HudiInternalConfig {
SkipConfigValidation,
Expand Down
23 changes: 23 additions & 0 deletions crates/core/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
//! Hudi Configurations.
use std::any::type_name;
use std::collections::HashMap;
use std::sync::Arc;
Expand All @@ -28,15 +29,20 @@ pub mod table;

pub const HUDI_CONF_DIR: &str = "HUDI_CONF_DIR";

/// This defines some common APIs for working with configurations in Hudi.
pub trait ConfigParser: AsRef<str> {
/// Configuration value type.
type Output;

/// Supplies the default value of the configuration.
fn default_value(&self) -> Option<Self::Output>;

/// To indicate if the configuration is required or not, this helps in validation.
fn is_required(&self) -> bool {
false
}

/// Validate the configuration by parsing the given [String] value and check if it is required.
fn validate(&self, configs: &HashMap<String, String>) -> Result<()> {
match self.parse_value(configs) {
Ok(_) => Ok(()),
Expand All @@ -51,8 +57,12 @@ pub trait ConfigParser: AsRef<str> {
}
}

/// Parse the [String] value to [Self::Output].
fn parse_value(&self, configs: &HashMap<String, String>) -> Result<Self::Output>;

/// Parse the [String] value to [Self::Output], or return the default value.
///
/// Panic if the default value is not defined.
fn parse_value_or_default(&self, configs: &HashMap<String, String>) -> Self::Output {
self.parse_value(configs).unwrap_or_else(|_| {
self.default_value()
Expand All @@ -61,6 +71,7 @@ pub trait ConfigParser: AsRef<str> {
}
}

/// All possible data types for Hudi Configuration values.
#[derive(Clone, Debug)]
pub enum HudiConfigValue {
Boolean(bool),
Expand All @@ -71,6 +82,13 @@ pub enum HudiConfigValue {
}

impl HudiConfigValue {
/// Convert the [HudiConfigValue] logical type to its representing data type in Rust.
///
/// - [`HudiConfigValue::Boolean`] -> [bool]
/// - [`HudiConfigValue::Integer`] -> [isize]
/// - [`HudiConfigValue::UInteger`] -> [usize]
/// - [`HudiConfigValue::String`] -> [String]
/// - [`HudiConfigValue::List`] -> [`Vec<String>`]
pub fn to<T: 'static + std::fmt::Debug + From<HudiConfigValue>>(self) -> T {
T::from(self)
}
Expand Down Expand Up @@ -124,18 +142,21 @@ impl From<HudiConfigValue> for Vec<String> {
}
}

/// Hudi configuration container.
///
/// Stores the raw, unparsed key-value pairs; values are parsed on demand
/// through [ConfigParser] implementations.
#[derive(Clone, Debug)]
pub struct HudiConfigs {
    /// Raw key-value configuration pairs, shared cheaply across clones via [Arc].
    pub raw_configs: Arc<HashMap<String, String>>,
}

impl HudiConfigs {
/// Create [HudiConfigs] with key-value pairs of [String]s.
pub fn new(raw_configs: HashMap<String, String>) -> Self {
Self {
raw_configs: Arc::new(raw_configs),
}
}

/// Create empty [HudiConfigs].
pub fn empty() -> Self {
Self {
raw_configs: Arc::new(HashMap::new()),
Expand All @@ -153,13 +174,15 @@ impl HudiConfigs {
parser.parse_value(&self.raw_configs)
}

/// Get value or default value. If the config has no default value, this will panic.
pub fn get_or_default(
&self,
parser: impl ConfigParser<Output = HudiConfigValue>,
) -> HudiConfigValue {
parser.parse_value_or_default(&self.raw_configs)
}

/// Get value or default value. If the config has no default value, this will return [None].
pub fn try_get(
&self,
parser: impl ConfigParser<Output = HudiConfigValue>,
Expand Down
24 changes: 24 additions & 0 deletions crates/core/src/config/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
//! Hudi read configurations.

use std::collections::HashMap;
use std::str::FromStr;
Expand All @@ -24,9 +25,32 @@ use crate::config::{ConfigParser, HudiConfigValue};
use anyhow::{anyhow, Result};
use strum_macros::EnumIter;

/// Configurations for reading Hudi tables.
///
/// **Example**
///
/// ```rust
/// use url::Url;
/// use hudi_core::config::read::HudiReadConfig::{AsOfTimestamp, InputPartitions};
/// use hudi_core::table::Table as HudiTable;
///
/// let options = vec![(InputPartitions.as_ref(), "2"),
///     (AsOfTimestamp.as_ref(), "20240101010100000")];
/// let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
/// HudiTable::new_with_options(base_uri.as_ref(), options);
/// ```
#[derive(Clone, Debug, PartialEq, Eq, Hash, EnumIter)]
pub enum HudiReadConfig {
    /// Number of input partitions (splits) for reading.
    /// - Hoodie Key : hoodie.read.input.partitions
    ///
    /// For example, if the table has 100 files and [InputPartitions] is 5,
    /// the files are divided into 5 chunks, with each iterator or task
    /// processing 20 files.
    InputPartitions,

    /// The query instant for time travel. If this option is not specified,
    /// the latest snapshot is queried.
    /// - Hoodie Key : hoodie.read.as.of.timestamp
    AsOfTimestamp,
}

Expand Down
58 changes: 58 additions & 0 deletions crates/core/src/config/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
//! Hudi table configurations.

use std::collections::HashMap;
use std::str::FromStr;
Expand All @@ -26,22 +27,77 @@ use strum_macros::{AsRefStr, EnumIter};

use crate::config::{ConfigParser, HudiConfigValue};

/// Configurations for Hudi tables, persisted in `hoodie.properties`.
///
/// **Example**
///
/// ```rust
/// use url::Url;
/// use hudi_core::config::table::HudiTableConfig::BaseFileFormat;
/// use hudi_core::table::Table as HudiTable;
///
/// let options = vec![(BaseFileFormat.as_ref(), "parquet")];
/// let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
/// HudiTable::new_with_options(base_uri.as_ref(), options);
/// ```
#[derive(Clone, Debug, PartialEq, Eq, Hash, EnumIter)]
pub enum HudiTableConfig {
    /// Base file format.
    ///
    /// Currently only parquet is supported.
    BaseFileFormat,

    /// Table checksum is used to guard against partial writes in HDFS.
    /// It is added as the last entry in `hoodie.properties` and is then used to
    /// validate the table config while reading.
    Checksum,

    /// Database name that will be used for incremental query.
    /// If different databases have the same table name during an incremental query,
    /// this can be set to limit the table name under a specific database.
    DatabaseName,

    /// When set to true, will not write the partition columns into Hudi. By default, false.
    DropsPartitionFields,

    /// Flag to indicate whether to use Hive style partitioning.
    /// If set to true, the names of partition folders follow the
    /// `<partition_column_name>=<partition_value>` format.
    /// By default false (the names of partition folders are only partition values).
    IsHiveStylePartitioning,

    /// Should the partition path value be url-encoded before creating the folder structure.
    IsPartitionPathUrlencoded,

    /// Key Generator class property for the hoodie table.
    KeyGeneratorClass,

    /// Fields used to partition the table. Concatenated values of these fields are used as
    /// the partition path, by invoking `toString()`.
    /// These fields also include the partition type, which is used by custom key generators.
    PartitionFields,

    /// Field used in pre-combining before the actual write. By default, when two records have
    /// the same key value, the one with the largest value for the precombine field, determined
    /// by `Object.compareTo(..)`, is picked.
    PrecombineField,

    /// When enabled, populates all meta fields. When disabled, no meta fields are populated,
    /// and incremental queries will not be functional. This is only meant to be used for
    /// append-only/immutable data for batch processing.
    PopulatesMetaFields,

    /// Columns used to uniquely identify the table.
    /// Concatenated values of these fields are used as the record key component of `HoodieKey`.
    RecordKeyFields,

    /// Table name that will be used for registering with Hive. Needs to be the same across runs.
    TableName,

    /// The table type for the underlying data, for this write. This can't change between writes.
    TableType,

    /// Version of the table, used for running upgrade/downgrade steps between releases with
    /// potentially breaking/backwards-compatible changes.
    TableVersion,

    /// Version of the timeline layout used by the table.
    TimelineLayoutVersion,
}

Expand Down Expand Up @@ -129,6 +185,7 @@ impl ConfigParser for HudiTableConfig {
}
}

/// Config value for [HudiTableConfig::TableType].
#[derive(Clone, Debug, PartialEq, AsRefStr)]
pub enum TableTypeValue {
#[strum(serialize = "COPY_ON_WRITE")]
Expand All @@ -149,6 +206,7 @@ impl FromStr for TableTypeValue {
}
}

/// Config value for [HudiTableConfig::BaseFileFormat].
#[derive(Clone, Debug, PartialEq, AsRefStr)]
pub enum BaseFileFormatValue {
#[strum(serialize = "parquet")]
Expand Down
15 changes: 15 additions & 0 deletions crates/core/src/file_group/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
* specific language governing permissions and limitations
* under the License.
*/
//! This module is for File Group related models and APIs.
//!
//! A set of data/base files + set of log files, that make up a unit for all operations.

use std::collections::BTreeMap;
use std::fmt;
Expand All @@ -29,15 +32,21 @@ use crate::storage::file_info::FileInfo;
use crate::storage::file_stats::FileStats;
use crate::storage::Storage;

/// Represents common metadata about a Hudi Base File.
#[derive(Clone, Debug)]
pub struct BaseFile {
    /// The file group id that is unique across the table.
    pub file_group_id: String,

    /// The commit (instant) time, extracted from the base file's name.
    pub commit_time: String,

    /// Storage-level [FileInfo] for this base file.
    pub info: FileInfo,

    /// Optional [FileStats] for the file — NOTE(review): appears to be absent until
    /// stats are populated after construction; confirm against the loading code.
    pub stats: Option<FileStats>,
}

impl BaseFile {
/// Parse file name and extract file_group_id and commit_time.
fn parse_file_name(file_name: &str) -> Result<(String, String)> {
let err_msg = format!("Failed to parse file name '{}' for base file.", file_name);
let (name, _) = file_name.rsplit_once('.').ok_or(anyhow!(err_msg.clone()))?;
Expand All @@ -47,6 +56,7 @@ impl BaseFile {
Ok((file_group_id, commit_time))
}

/// Construct [BaseFile] with the base file name.
pub fn from_file_name(file_name: &str) -> Result<Self> {
let (file_group_id, commit_time) = Self::parse_file_name(file_name)?;
Ok(Self {
Expand All @@ -57,6 +67,7 @@ impl BaseFile {
})
}

/// Construct [BaseFile] with the [FileInfo].
pub fn from_file_info(info: FileInfo) -> Result<Self> {
let (file_group_id, commit_time) = Self::parse_file_name(&info.name)?;
Ok(Self {
Expand All @@ -68,6 +79,10 @@ impl BaseFile {
}
}

/// Within a file group, a slice is a combination of the data file written at a commit time and
/// the list of log files containing changes to the data file from that commit time.
///
/// Note: log files are not yet supported.
#[derive(Clone, Debug)]
pub struct FileSlice {
pub base_file: BaseFile,
Expand Down
31 changes: 31 additions & 0 deletions crates/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,37 @@
* specific language governing permissions and limitations
* under the License.
*/
//! Crate `hudi-core`.
//!
//! # The [config] module is responsible for managing configurations.
//!
//! **Example**
//!
//! ```rust
//! use url::Url;
//! use hudi_core::config::read::HudiReadConfig::{AsOfTimestamp, InputPartitions};
//! use hudi_core::table::Table as HudiTable;
//!
//! let options = vec![(InputPartitions.as_ref(), "2"),
//! (AsOfTimestamp.as_ref(), "20240101010100000")];
//! let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
//! HudiTable::new_with_options(base_uri.as_ref(), options);
//! ```
//!
//! # The [table] module is responsible for managing Hudi tables.
//!
//! **Example**
//!
//! Create a Hudi table:
//! ```rust
//! use url::Url;
//! use hudi_core::table::Table;
//!
//! pub async fn test() {
//! let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
//! let hudi_table = Table::new(base_uri.path()).await.unwrap();
//! }
//! ```

pub mod config;
pub mod file_group;
Expand Down
10 changes: 10 additions & 0 deletions crates/core/src/storage/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
//! This module is responsible for interacting with the storage layer.

use std::collections::HashMap;
use std::path::PathBuf;
Expand Down Expand Up @@ -185,6 +186,15 @@ impl Storage {
}
}

/// Get relative paths of leaf directories under a given directory.
///
/// **Example**
/// - /usr/hudi/table_name
/// - /usr/hudi/table_name/.hoodie
/// - /usr/hudi/table_name/dt=2024/month=01/day=01
/// - /usr/hudi/table_name/dt=2025/month=02
///
/// the result is \[".hoodie", "dt=2024/month=01/day=01", "dt=2025/month=02"\]
#[async_recursion]
pub async fn get_leaf_dirs(storage: &Storage, subdir: Option<&str>) -> Result<Vec<String>> {
let mut leaf_dirs = Vec::new();
Expand Down
Loading
Loading