Skip to content

Commit 8036567

Browse files
committed
Improve XML parsing and formatting for cleaner content extraction
Simplifies XML handling by dropping the `pretty` formatter and disabling formatted builder output, decodes common HTML entities in tag values, collapses runs of whitespace, and trims the text of content tags (p, article, section, h1–h3) when rebuilding. Also ignores attributes and turns off the preserveOrder flag for more efficient parsing. Adds nav, footer and header to the parser's stopNodes so that navigation, header and footer elements are not parsed as content.
1 parent 4f4b393 commit 8036567

File tree

1 file changed

+29
-11
lines changed

1 file changed

+29
-11
lines changed

scripts/substack-minutes-to-scenario.ts

+29-11
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ import { promises as fs } from 'fs'
22
import path from 'path'
33
import { encoding_for_model } from '@dqbd/tiktoken'
44
import { XMLBuilder, XMLParser } from 'fast-xml-parser'
5-
import pretty from 'pretty'
65
import logger from '../src/logger'
76

87
const POSTS_DIR = 'posts'
@@ -30,22 +29,41 @@ const countTokens = (text: string): number => {
3029

3130
const formatXml = (posts: Post[]): string => {
3231
const parser = new XMLParser({
33-
ignoreAttributes: false,
34-
preserveOrder: true,
32+
ignoreAttributes: true,
33+
preserveOrder: false,
3534
parseTagValue: true,
36-
parseAttributeValue: true,
37-
trimValues: false,
35+
parseAttributeValue: false,
36+
trimValues: true,
3837
unpairedTags: ["img", "br", "hr", "source"],
39-
stopNodes: ["*.script", "*.style"]
38+
stopNodes: [
39+
"*.script",
40+
"*.style",
41+
"*.nav",
42+
"*.footer",
43+
"*.header"
44+
],
45+
tagValueProcessor: (tagName: string, value: string) => {
46+
return value
47+
.replace(/&nbsp;/g, ' ')
48+
.replace(/&amp;/g, '&')
49+
.replace(/&lt;/g, '<')
50+
.replace(/&gt;/g, '>')
51+
.replace(/\s+/g, ' ')
52+
}
4053
})
4154

4255
const builder = new XMLBuilder({
43-
format: true,
44-
indentBy: ' ',
56+
format: false,
4557
suppressEmptyNode: true,
46-
ignoreAttributes: false,
47-
preserveOrder: true,
48-
unpairedTags: ["img", "br", "hr", "source"]
58+
ignoreAttributes: true,
59+
preserveOrder: false,
60+
unpairedTags: ["img", "br", "hr", "source"],
61+
tagValueProcessor: (tagName: string, value: unknown) => {
62+
if (typeof value === 'string' && ["p", "article", "section", "h1", "h2", "h3"].includes(tagName)) {
63+
return value.trim();
64+
}
65+
return value;
66+
}
4967
})
5068

5169
const postsData = posts.map(p => {

0 commit comments

Comments
 (0)