Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Footnotes: Add block-level caching when parsing content for footnotes #52577

Merged
merged 8 commits into from
Sep 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 2 additions & 134 deletions packages/core-data/src/entity-provider.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,17 @@ import {
} from '@wordpress/element';
import { useSelect, useDispatch } from '@wordpress/data';
import { parse, __unstableSerializeAndClean } from '@wordpress/blocks';
import { privateApis as blockEditorPrivateApis } from '@wordpress/block-editor';

/**
* Internal dependencies
*/
import { STORE_NAME } from './name';
import { unlock } from './private-apis';
import { updateFootnotesFromMeta } from './footnotes';

/** @typedef {import('@wordpress/blocks').WPBlock} WPBlock */

const EMPTY_ARRAY = [];

let oldFootnotes = {};

/**
* Internal dependencies
*/
Expand Down Expand Up @@ -182,136 +179,7 @@ export function useEntityBlockEditor( kind, name, { id: _id } = {} ) {
}, [ editedBlocks, content ] );

const updateFootnotes = useCallback(
( _blocks ) => {
const output = { blocks: _blocks };
if ( ! meta ) return output;
// If meta.footnotes is empty, it means the meta is not registered.
if ( meta.footnotes === undefined ) return output;

const { getRichTextValues } = unlock( blockEditorPrivateApis );
const _content = getRichTextValues( _blocks ).join( '' ) || '';
const newOrder = [];

// This can be avoided when
// https://github.com/WordPress/gutenberg/pull/43204 lands. We can then
// get the order directly from the rich text values.
if ( _content.indexOf( 'data-fn' ) !== -1 ) {
const regex = /data-fn="([^"]+)"/g;
let match;
while ( ( match = regex.exec( _content ) ) !== null ) {
newOrder.push( match[ 1 ] );
}
}

const footnotes = meta.footnotes
? JSON.parse( meta.footnotes )
: [];
const currentOrder = footnotes.map( ( fn ) => fn.id );

if ( currentOrder.join( '' ) === newOrder.join( '' ) )
return output;

const newFootnotes = newOrder.map(
( fnId ) =>
footnotes.find( ( fn ) => fn.id === fnId ) ||
oldFootnotes[ fnId ] || {
id: fnId,
content: '',
}
);

function updateAttributes( attributes ) {
// Only attempt to update attributes, if attributes is an object.
if (
! attributes ||
Array.isArray( attributes ) ||
typeof attributes !== 'object'
) {
return attributes;
}

attributes = { ...attributes };

for ( const key in attributes ) {
const value = attributes[ key ];

if ( Array.isArray( value ) ) {
attributes[ key ] = value.map( updateAttributes );
continue;
}

if ( typeof value !== 'string' ) {
continue;
}

if ( value.indexOf( 'data-fn' ) === -1 ) {
continue;
}

// When we store rich text values, this would no longer
// require a regex.
const regex =
/(<sup[^>]+data-fn="([^"]+)"[^>]*><a[^>]*>)[\d*]*<\/a><\/sup>/g;

attributes[ key ] = value.replace(
regex,
( match, opening, fnId ) => {
const index = newOrder.indexOf( fnId );
return `${ opening }${ index + 1 }</a></sup>`;
}
);

const compatRegex =
/<a[^>]+data-fn="([^"]+)"[^>]*>\*<\/a>/g;

attributes[ key ] = attributes[ key ].replace(
compatRegex,
( match, fnId ) => {
const index = newOrder.indexOf( fnId );
return `<sup data-fn="${ fnId }" class="fn"><a href="#${ fnId }" id="${ fnId }-link">${
index + 1
}</a></sup>`;
}
);
}

return attributes;
}

function updateBlocksAttributes( __blocks ) {
return __blocks.map( ( block ) => {
return {
...block,
attributes: updateAttributes( block.attributes ),
innerBlocks: updateBlocksAttributes(
block.innerBlocks
),
};
} );
}

// We need to go through all block attributes deeply and update the
// footnote anchor numbering (textContent) to match the new order.
const newBlocks = updateBlocksAttributes( _blocks );

oldFootnotes = {
...oldFootnotes,
...footnotes.reduce( ( acc, fn ) => {
if ( ! newOrder.includes( fn.id ) ) {
acc[ fn.id ] = fn;
}
return acc;
}, {} ),
};

return {
meta: {
...meta,
footnotes: JSON.stringify( newFootnotes ),
},
blocks: newBlocks,
};
},
( _blocks ) => updateFootnotesFromMeta( _blocks, meta ),
[ meta ]
);

Expand Down
30 changes: 30 additions & 0 deletions packages/core-data/src/footnotes/get-footnotes-order.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
* Internal dependencies
*/
import getRichTextValuesCached from './get-rich-text-values-cached';

const cache = new WeakMap();

function getBlockFootnotesOrder( block ) {
if ( ! cache.has( block ) ) {
const content = getRichTextValuesCached( block ).join( '' );
const newOrder = [];

// https://github.com/WordPress/gutenberg/pull/43204 lands. We can then
// get the order directly from the rich text values.
if ( content.indexOf( 'data-fn' ) !== -1 ) {
const regex = /data-fn="([^"]+)"/g;
let match;
while ( ( match = regex.exec( content ) ) !== null ) {
newOrder.push( match[ 1 ] );
}
}
Copy link
Member

@dmsnell dmsnell Jul 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

given that we have easy access to a DOM here, why parse the HTML with the regular expressions? would it be just the same if we did this methodically instead? particularly since we're caching the values?

if ( ! content.includes( 'data-fn' ) ) {
	cache.set( block, [] );
}

const dom = doTheStuffWeDoToParseHTML( content );
dom.querySelectorAll( '[data-fn]' ).forEach( ( node ) => {
	newOrder.push( node.getAttribute( 'data-fn' ) );
} )

cache.set( block, newOrder );
}

return cache.get( block );
}

export default function getFootnotesOrder( blocks ) {
return blocks.flatMap( getBlockFootnotesOrder );
}
35 changes: 35 additions & 0 deletions packages/core-data/src/footnotes/get-rich-text-values-cached.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/**
* WordPress dependencies
*/
import { privateApis as blockEditorPrivateApis } from '@wordpress/block-editor';

/**
* Internal dependencies
*/
import { unlock } from '../private-apis';

// TODO: The following line should have been:
//
// const unlockedApis = unlock( blockEditorPrivateApis );
//
// But there are hidden circular dependencies in RNMobile code, specifically in
// certain native components in the `components` package that depend on
// `block-editor`. What follows is a workaround that defers the `unlock` call
// to prevent native code from failing.
//
// Fix once https://github.com/WordPress/gutenberg/issues/52692 is closed.
let unlockedApis;

const cache = new WeakMap();

export default function getRichTextValuesCached( block ) {
if ( ! unlockedApis ) {
unlockedApis = unlock( blockEditorPrivateApis );
}

if ( ! cache.has( block ) ) {
const values = unlockedApis.getRichTextValues( [ block ] );
cache.set( block, values );
}
return cache.get( block );
}
119 changes: 119 additions & 0 deletions packages/core-data/src/footnotes/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/**
* Internal dependencies
*/
import getFootnotesOrder from './get-footnotes-order';

let oldFootnotes = {};

export function updateFootnotesFromMeta( blocks, meta ) {
const output = { blocks };
if ( ! meta ) return output;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this syntax makes me so nervous. it's been the vector of some of the biggest attacks and looks otherwise so innocent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you clarify? Is it the lack of braces around the THEN block?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah that's right. it's essentially the same discussion as with automatic semicolon insertion, which the industry has overwhelmingly rejected. I thought we overwhelmingly rejected code blocks without braces after major exploits (notably the OpenSSL-bypass).

even if ( ! meta ) { return output; } retains the safety that if ( ! meta ) return output; eschews, so giving up the safety doesn't even avoid any lines.

there are some trivial cases, particularly with comments, that can fool auto-formatters into creating the bugs. formatters move comments around, and the newlines and comments themselves can rip apart these coupled blocks. the braces tie them together.

curious what the advantages are of doing it this way


// If meta.footnotes is empty, it means the meta is not registered.
if ( meta.footnotes === undefined ) return output;

const newOrder = getFootnotesOrder( blocks );

const footnotes = meta.footnotes ? JSON.parse( meta.footnotes ) : [];
const currentOrder = footnotes.map( ( fn ) => fn.id );

if ( currentOrder.join( '' ) === newOrder.join( '' ) ) return output;

const newFootnotes = newOrder.map(
( fnId ) =>
footnotes.find( ( fn ) => fn.id === fnId ) ||
oldFootnotes[ fnId ] || {
id: fnId,
content: '',
}
);

function updateAttributes( attributes ) {
// Only attempt to update attributes, if attributes is an object.
if (
! attributes ||
Array.isArray( attributes ) ||
typeof attributes !== 'object'
) {
return attributes;
}

attributes = { ...attributes };

for ( const key in attributes ) {
const value = attributes[ key ];

if ( Array.isArray( value ) ) {
attributes[ key ] = value.map( updateAttributes );
continue;
}

if ( typeof value !== 'string' ) {
continue;
}

if ( value.indexOf( 'data-fn' ) === -1 ) {
continue;
}

// When we store rich text values, this would no longer
// require a regex.
Copy link
Member

@dmsnell dmsnell Jul 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no regex is required today, no? and it's JS so it's easy to build a DOM of the HTML and then modify it without risking more corruption.

it's also unclear to me what our purpose is here, partially because of the obscurity of the regex patterns themselves. regex is moving the footnote id into the link text?

const dom = parseHTML( value );

// Set the footnote id as the link's text.
dom.querySelectorAll( 'sup[data-fn] > a' ).forEach( ( node ) => {
	const fnId = node.parentNode.getAttribute( 'data-fn' );

	node.innerText = newOrder.indexOf( fnId ) + 1;
} );

// Replace footnotes that are only anchor tags with the <sup>-wrapped version.
// From: <a data-fn="a431c">a431c</a>
//   To: <sup data-fn="a431c" class="fn"><a href="#a431c" id="a431c-link">a431c</a></sup>
dom.querySelectorAll( 'a[data-fn]' ).forEach( ( node ) => {
	const fnId = node.parentNode.getAttribute( 'data-fn' );

	const sup = dom.createElement( 'sup' );
	sup.setAttribute( 'data-fn', fnId );
	sup.classList.add( 'fn' );

	const link = dom.createElement( 'a' );
	link.setAttribute( 'href', `#${ fnId }` );
	link.setAttribute( 'id', `${ fnId }-link` );
	link.innerText = fnId + 1;

	sup.appendChild( link );
	node.parentNode.replaceNode( node, sup );
} );

attributes[ key ] = dom.outerHTML;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These regexes are just temporary until we get rich text values in the store. If we can ever do that at least :(
We could cache the rich text values instead somewhere, but we can't use a weak map since the key would be a string... Any ideas?

I would prefer if we could cache the rich text values themselves somewhere instead of caching rich text values for a block.

const regex =
/(<sup[^>]+data-fn="([^"]+)"[^>]*><a[^>]*>)[\d*]*<\/a><\/sup>/g;

attributes[ key ] = value.replace(
regex,
( match, opening, fnId ) => {
const index = newOrder.indexOf( fnId );
return `${ opening }${ index + 1 }</a></sup>`;
}
);

const compatRegex = /<a[^>]+data-fn="([^"]+)"[^>]*>\*<\/a>/g;

attributes[ key ] = attributes[ key ].replace(
compatRegex,
( match, fnId ) => {
const index = newOrder.indexOf( fnId );
return `<sup data-fn="${ fnId }" class="fn"><a href="#${ fnId }" id="${ fnId }-link">${
index + 1
}</a></sup>`;
}
);
}

return attributes;
}

function updateBlocksAttributes( __blocks ) {
return __blocks.map( ( block ) => {
return {
...block,
attributes: updateAttributes( block.attributes ),
innerBlocks: updateBlocksAttributes( block.innerBlocks ),
};
} );
}

// We need to go through all block attributes deeply and update the
// footnote anchor numbering (textContent) to match the new order.
const newBlocks = updateBlocksAttributes( blocks );

oldFootnotes = {
...oldFootnotes,
...footnotes.reduce( ( acc, fn ) => {
if ( ! newOrder.includes( fn.id ) ) {
acc[ fn.id ] = fn;
}
return acc;
}, {} ),
};

return {
meta: {
...meta,
footnotes: JSON.stringify( newFootnotes ),
},
blocks: newBlocks,
};
}