diff --git a/crates/ruff_python_formatter/src/string/normalize.rs b/crates/ruff_python_formatter/src/string/normalize.rs index 6836ad80828ae2..5e5706a38f769f 100644 --- a/crates/ruff_python_formatter/src/string/normalize.rs +++ b/crates/ruff_python_formatter/src/string/normalize.rs @@ -14,7 +14,7 @@ use crate::QuoteStyle; pub(crate) struct StringNormalizer<'a, 'src> { quoting: Quoting, - preferred_quote_style: QuoteStyle, + preferred_quote_style: Option, context: &'a PyFormatContext<'src>, } @@ -22,13 +22,13 @@ impl<'a, 'src> StringNormalizer<'a, 'src> { pub(crate) fn from_context(context: &'a PyFormatContext<'src>) -> Self { Self { quoting: Quoting::default(), - preferred_quote_style: context.options().quote_style(), + preferred_quote_style: None, context, } } pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self { - self.preferred_quote_style = quote_style; + self.preferred_quote_style = Some(quote_style); self } @@ -38,7 +38,9 @@ impl<'a, 'src> StringNormalizer<'a, 'src> { } fn quoting(&self, string: StringLikePart) -> Quoting { - if let FStringState::InsideExpressionElement(context) = self.context.f_string_state() { + match (self.quoting, self.context.f_string_state()) { + (Quoting::Preserve, _) => Quoting::Preserve, + // If we're inside an f-string, we need to make sure to preserve the // existing quotes unless we're inside a triple-quoted f-string and // the inner string itself isn't triple-quoted. For example: @@ -53,32 +55,36 @@ impl<'a, 'src> StringNormalizer<'a, 'src> { // The reason to preserve the quotes is based on the assumption that // the original f-string is valid in terms of quoting, and we don't // want to change that to make it invalid. - if (context.f_string().flags().is_triple_quoted() && !string.flags().is_triple_quoted()) - || self.context.options().target_version().supports_pep_701() - { - self.quoting - } else { - Quoting::Preserve + (Quoting::CanChange, FStringState::InsideExpressionElement(context)) => { + if (context.f_string().flags().is_triple_quoted() + && !string.flags().is_triple_quoted()) + || self.context.options().target_version().supports_pep_701() + { + Quoting::CanChange + } else { + Quoting::Preserve + } } - } else { - self.quoting + + (Quoting::CanChange, _) => Quoting::CanChange, } } - /// Computes the strings preferred quotes. - pub(crate) fn choose_quotes(&self, string: StringLikePart) -> QuoteSelection { - let raw_content = self.context.locator().slice(string.content_range()); - let first_quote_or_normalized_char_offset = raw_content - .bytes() - .position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{')); - let string_flags = string.flags(); - - let new_kind = match self.quoting(string) { - Quoting::Preserve => string_flags, + /// Determines the preferred quote style for `string`. + /// The formatter should use the preferred quote style unless + /// it can't because the string contains the preferred quotes OR + /// it leads to more escaping. + pub(super) fn preferred_quote_style(&self, string: StringLikePart) -> QuoteStyle { + match self.quoting(string) { + Quoting::Preserve => QuoteStyle::Preserve, Quoting::CanChange => { + let preferred_quote_style = self + .preferred_quote_style + .unwrap_or(self.context.options().quote_style()); + // Per PEP 8, always prefer double quotes for triple-quoted strings. // Except when using quote-style-preserve. - let preferred_style = if string_flags.is_triple_quoted() { + if string.flags().is_triple_quoted() { // ... unless we're formatting a code snippet inside a docstring, // then we specifically want to invert our quote style to avoid // writing out invalid Python. @@ -126,39 +132,48 @@ impl<'a, 'src> StringNormalizer<'a, 'src> { // if it doesn't have perfect alignment with PEP8. if let Some(quote) = self.context.docstring() { QuoteStyle::from(quote.opposite()) - } else if self.preferred_quote_style.is_preserve() { + } else if preferred_quote_style.is_preserve() { QuoteStyle::Preserve } else { QuoteStyle::Double } } else { - self.preferred_quote_style - }; - - if let Ok(preferred_quote) = Quote::try_from(preferred_style) { - if let Some(first_quote_or_normalized_char_offset) = - first_quote_or_normalized_char_offset - { - if string_flags.is_raw_string() { - choose_quotes_for_raw_string( - &raw_content[first_quote_or_normalized_char_offset..], - string_flags, - preferred_quote, - ) - } else { - choose_quotes_impl( - &raw_content[first_quote_or_normalized_char_offset..], - string_flags, - preferred_quote, - ) - } - } else { - string_flags.with_quote_style(preferred_quote) - } - } else { - string_flags + preferred_quote_style } } + } + } + + /// Computes the strings preferred quotes. + pub(crate) fn choose_quotes(&self, string: StringLikePart) -> QuoteSelection { + let raw_content = self.context.locator().slice(string.content_range()); + let first_quote_or_normalized_char_offset = raw_content + .bytes() + .position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{')); + let string_flags = string.flags(); + let preferred_style = self.preferred_quote_style(string); + + let new_kind = match ( + Quote::try_from(preferred_style), + first_quote_or_normalized_char_offset, + ) { + // The string contains no quotes so it's safe to use the preferred quote style + (Ok(preferred_quote), None) => string_flags.with_quote_style(preferred_quote), + + // The preferred quote style is single or double quotes, and the string contains a quote or + // another character that may require escaping + (Ok(preferred_quote), Some(first_quote_or_normalized_char_offset)) => { + let quote = QuoteMetadata::from_str( + &raw_content[first_quote_or_normalized_char_offset..], + string.flags(), + preferred_quote, + ) + .choose(preferred_quote); + string_flags.with_quote_style(quote) + } + + // The preferred quote style is to preserve the quotes, so let's do that. + (Err(()), _) => string_flags, }; QuoteSelection { @@ -209,119 +224,93 @@ impl QuoteSelection { } } -#[derive(Debug)] -pub(crate) struct NormalizedString<'a> { - /// Holds data about the quotes and prefix of the string - flags: AnyStringFlags, - - /// The range of the string's content in the source (minus prefix and quotes). - content_range: TextRange, +#[derive(Clone, Debug)] +pub(crate) struct QuoteMetadata { + kind: QuoteMetadataKind, - /// The normalized text - text: Cow<'a, str>, + /// The quote style in the source. + source_style: Quote, } -impl<'a> NormalizedString<'a> { - pub(crate) fn text(&self) -> &Cow<'a, str> { - &self.text - } - - pub(crate) fn flags(&self) -> AnyStringFlags { - self.flags - } -} +/// Tracks information about the used quotes in a string which is used +/// to choose the quotes for a part. +impl QuoteMetadata { + pub(crate) fn from_str(text: &str, flags: AnyStringFlags, preferred_quote: Quote) -> Self { + let kind = if flags.is_raw_string() { + QuoteMetadataKind::raw(text, preferred_quote, flags.is_triple_quoted()) + } else if flags.is_triple_quoted() { + QuoteMetadataKind::triple_quoted(text, preferred_quote) + } else { + QuoteMetadataKind::regular(text) + }; -impl Ranged for NormalizedString<'_> { - fn range(&self) -> TextRange { - self.content_range + Self { + kind, + source_style: flags.quote_style(), + } } -} -impl Format> for NormalizedString<'_> { - fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { - let quotes = StringQuotes::from(self.flags); - ruff_formatter::write!(f, [self.flags.prefix(), quotes])?; - match &self.text { - Cow::Borrowed(_) => { - source_text_slice(self.range()).fmt(f)?; + pub(super) fn choose(&self, preferred_quote: Quote) -> Quote { + match self.kind { + QuoteMetadataKind::Raw { contains_preferred } => { + if contains_preferred { + self.source_style + } else { + preferred_quote + } } - Cow::Owned(normalized) => { - text(normalized).fmt(f)?; + QuoteMetadataKind::Triple { contains_preferred } => { + if contains_preferred { + self.source_style + } else { + preferred_quote + } } + QuoteMetadataKind::Regular { + single_quotes, + double_quotes, + } => match single_quotes.cmp(&double_quotes) { + Ordering::Less => Quote::Single, + Ordering::Equal => preferred_quote, + Ordering::Greater => Quote::Double, + }, } - quotes.fmt(f) } } -/// Choose the appropriate quote style for a raw string. -/// -/// The preferred quote style is chosen unless the string contains unescaped quotes of the -/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote -/// style is double quotes. -fn choose_quotes_for_raw_string( - input: &str, - flags: AnyStringFlags, - preferred_quote: Quote, -) -> AnyStringFlags { - let preferred_quote_char = preferred_quote.as_char(); - let mut chars = input.chars().peekable(); - let contains_unescaped_configured_quotes = loop { - match chars.next() { - Some('\\') => { - // Ignore escaped characters - chars.next(); - } - // `"` or `'` - Some(c) if c == preferred_quote_char => { - if !flags.is_triple_quoted() { - break true; - } +#[derive(Copy, Clone, Debug)] +enum QuoteMetadataKind { + /// A raw string. + /// + /// For raw strings it's only possible to change the quotes if the preferred quote style + /// isn't used inside the string. + Raw { contains_preferred: bool }, - match chars.peek() { - // We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser - // about where the closing triple quotes start - None => break true, - Some(next) if *next == preferred_quote_char => { - // `""` or `''` - chars.next(); + /// Regular (non raw) triple quoted string. + /// + /// For triple quoted strings it's only possible to change the quotes if no + /// triple of the preferred quotes is used inside the string. + Triple { contains_preferred: bool }, - // We can't turn `r'''""'''` into `r""""""""`, nor can we have - // `"""` or `'''` respectively inside the string - if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) { - break true; - } - } - _ => {} - } - } - Some(_) => continue, - None => break false, - } - }; - if contains_unescaped_configured_quotes { - flags - } else { - flags.with_quote_style(preferred_quote) - } + /// A single quoted string that uses either double or single quotes. + /// + /// For regular strings it's desired to pick the quote style that requires the least escaping. + /// E.g. pick single quotes for `'A "dog"'` because using single quotes would require escaping + /// the two `"`. + Regular { + single_quotes: u32, + double_quotes: u32, + }, } -/// Choose the appropriate quote style for a string. -/// -/// For single quoted strings, the preferred quote style is used, unless the alternative quote style -/// would require fewer escapes. -/// -/// For triple quoted strings, the preferred quote style is always used, unless the string contains -/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be -/// used unless the string contains `"""`). -fn choose_quotes_impl( - input: &str, - flags: AnyStringFlags, - preferred_quote: Quote, -) -> AnyStringFlags { - let quote = if flags.is_triple_quoted() { +impl QuoteMetadataKind { + /// For triple quoted strings, the preferred quote style can't be used if the string contains + /// a tripled of the quote character (e.g., if double quotes are preferred, double quotes will be + /// used unless the string contains `"""`). + fn triple_quoted(content: &str, preferred_quote: Quote) -> Self { // True if the string contains a triple quote sequence of the configured quote style. let mut uses_triple_quotes = false; - let mut chars = input.chars().peekable(); + let mut chars = content.chars().peekable(); while let Some(c) = chars.next() { let preferred_quote_char = preferred_quote.as_char(); @@ -369,18 +358,18 @@ fn choose_quotes_impl( } } - if uses_triple_quotes { - // String contains a triple quote sequence of the configured quote style. - // Keep the existing quote style. - flags.quote_style() - } else { - preferred_quote + Self::Triple { + contains_preferred: uses_triple_quotes, } - } else { + } + + /// For single quoted strings, the preferred quote style is used, unless the alternative quote style + /// would require fewer escapes. + fn regular(text: &str) -> Self { let mut single_quotes = 0u32; let mut double_quotes = 0u32; - for c in input.chars() { + for c in text.chars() { match c { '\'' => { single_quotes += 1; @@ -394,25 +383,105 @@ fn choose_quotes_impl( } } - match single_quotes.cmp(&double_quotes) { - Ordering::Less => Quote::Single, - Ordering::Equal => preferred_quote, - Ordering::Greater => Quote::Double, + Self::Regular { + single_quotes, + double_quotes, } - }; + } + + /// Computes if a raw string uses the preferred quote. If it does, then it's not possible + /// to change the quote style because it would require escaping which isn't possible in raw strings. + fn raw(text: &str, preferred: Quote, triple_quoted: bool) -> Self { + let mut chars = text.chars().peekable(); + let preferred_quote_char = preferred.as_char(); + + let contains_unescaped_configured_quotes = loop { + match chars.next() { + Some('\\') => { + // Ignore escaped characters + chars.next(); + } + // `"` or `'` + Some(c) if c == preferred_quote_char => { + if !triple_quoted { + break true; + } + + match chars.peek() { + // We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser + // about where the closing triple quotes start + None => break true, + Some(next) if *next == preferred_quote_char => { + // `""` or `''` + chars.next(); + + // We can't turn `r'''""'''` into `r""""""""`, nor can we have + // `"""` or `'''` respectively inside the string + if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) + { + break true; + } + } + _ => {} + } + } + Some(_) => continue, + None => break false, + } + }; + + Self::Raw { + contains_preferred: contains_unescaped_configured_quotes, + } + } +} + +#[derive(Debug)] +pub(crate) struct NormalizedString<'a> { + /// Holds data about the quotes and prefix of the string + flags: AnyStringFlags, - flags.with_quote_style(quote) + /// The range of the string's content in the source (minus prefix and quotes). + content_range: TextRange, + + /// The normalized text + text: Cow<'a, str>, +} + +impl<'a> NormalizedString<'a> { + pub(crate) fn text(&self) -> &Cow<'a, str> { + &self.text + } + + pub(crate) fn flags(&self) -> AnyStringFlags { + self.flags + } +} + +impl Ranged for NormalizedString<'_> { + fn range(&self) -> TextRange { + self.content_range + } +} + +impl Format> for NormalizedString<'_> { + fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { + let quotes = StringQuotes::from(self.flags); + ruff_formatter::write!(f, [self.flags.prefix(), quotes])?; + match &self.text { + Cow::Borrowed(_) => source_text_slice(self.range()).fmt(f)?, + Cow::Owned(normalized) => text(normalized).fmt(f)?, + } + + quotes.fmt(f) + } } -/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input` -/// with the provided [`StringQuotes`] style. -/// -/// Returns the normalized string and whether it contains new lines. pub(crate) fn normalize_string( input: &str, start_offset: usize, - flags: AnyStringFlags, - format_fstring: bool, + new_flags: AnyStringFlags, + format_f_string: bool, ) -> Cow { // The normalized string if `input` is not yet normalized. // `output` must remain empty if `input` is already normalized. @@ -421,18 +490,19 @@ pub(crate) fn normalize_string( // If `last_index` is `0` at the end, then the input is already normalized and can be returned as is. let mut last_index = 0; - let quote = flags.quote_style(); + let quote = new_flags.quote_style(); let preferred_quote = quote.as_char(); let opposite_quote = quote.opposite().as_char(); let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable(); - let is_raw = flags.is_raw_string(); - let is_fstring = !format_fstring && flags.is_f_string(); + let is_raw = new_flags.is_raw_string(); + + let is_fstring = !format_f_string && new_flags.is_f_string(); let mut formatted_value_nesting = 0u32; while let Some((index, c)) = chars.next() { - if is_fstring && matches!(c, '{' | '}') { + if matches!(c, '{' | '}') && is_fstring { if chars.peek().copied().is_some_and(|(_, next)| next == c) { // Skip over the second character of the double braces chars.next(); @@ -444,6 +514,7 @@ pub(crate) fn normalize_string( } continue; } + if c == '\r' { output.push_str(&input[last_index..index]); @@ -466,8 +537,10 @@ pub(crate) fn normalize_string( } else { // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`) let escape_start_len = '\\'.len_utf8() + next.len_utf8(); - if let Some(normalised) = UnicodeEscape::new(next, !flags.is_byte_string()) - .and_then(|escape| escape.normalize(&input[index + escape_start_len..])) + if let Some(normalised) = + UnicodeEscape::new(next, !new_flags.is_byte_string()).and_then( + |escape| escape.normalize(&input[index + escape_start_len..]), + ) { let escape_start_offset = index + escape_start_len; if let Cow::Owned(normalised) = &normalised { @@ -485,7 +558,7 @@ pub(crate) fn normalize_string( } } - if !flags.is_triple_quoted() { + if !new_flags.is_triple_quoted() { #[allow(clippy::if_same_then_else)] if next == opposite_quote && formatted_value_nesting == 0 { // Remove the escape by ending before the backslash and starting again with the quote @@ -498,7 +571,7 @@ pub(crate) fn normalize_string( } } } - } else if !flags.is_triple_quoted() + } else if !new_flags.is_triple_quoted() && c == preferred_quote && formatted_value_nesting == 0 { @@ -511,14 +584,12 @@ pub(crate) fn normalize_string( } } - let normalized = if last_index == 0 { + if last_index == 0 { Cow::Borrowed(input) } else { output.push_str(&input[last_index..]); Cow::Owned(output) - }; - - normalized + } } #[derive(Clone, Debug)] @@ -671,14 +742,14 @@ impl UnicodeEscape { mod tests { use std::borrow::Cow; + use super::UnicodeEscape; + use crate::string::normalize_string; use ruff_python_ast::{ str::Quote, str_prefix::{AnyStringPrefix, ByteStringPrefix}, AnyStringFlags, }; - use super::{normalize_string, UnicodeEscape}; - #[test] fn normalize_32_escape() { let escape_sequence = UnicodeEscape::new('U', true).unwrap();