Skip to content

Commit

Permalink
Merge pull request #8 from MarvNC:add-zhuyin
Browse files (browse the repository at this point in the history)
Add Zhuyin Support
  • Loading branch information
MarvNC authored Aug 13, 2024
2 parents 500f976 + 863bce7 commit 29cca91
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 52 deletions.
Binary file modified bun.lockb
Binary file not shown.
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
"dependencies": {
"is-cjk-hanzi": "^1.0.0",
"pinyin-tone": "^2.2.0",
"yomichan-dict-builder": "^2.7.0"
"yomichan-dict-builder": "^2.7.0",
"zhuyin-improved": "^1.0.7"
},
"devDependencies": {
"@types/node": "^20.14.12",
Expand Down
4 changes: 3 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ export const FILE_NAME = 'cedict_1_0_ts_utf-8_mdbg.txt';
// Directory that exported index files and dictionary archives are written to.
export const BUILD_DIR = './build';
// Data directory (its use is not visible in this part of the code).
export const DATA_DIR = './data';

// Archive file names, one per generated dictionary.
export const TERM_ZIP_NAME = 'CC-CEDICT.zip';
export const ZHUYIN_ZIP_NAME = 'CC-CEDICT.Zhuyin.zip';
export const HANZI_ZIP_NAME = 'CC-CEDICT.Hanzi.zip';

// Index file names written next to the archives and referenced by the
// release download URLs. Each name is declared exactly once: a repeated
// `export const` of the same identifier is a compile error.
export const TERM_INDEX_NAME = 'term_index.json';
export const ZHUYIN_INDEX_NAME = 'zhuyin_index.json';
export const HANZI_INDEX_NAME = 'hanzi_index.json';
31 changes: 24 additions & 7 deletions src/dictionaryUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,35 @@ import { isCJKHanzi } from 'is-cjk-hanzi';

export async function processLine(
line: string,
termDict: Dictionary,
pinyinDict: Dictionary,
zhuyinDict: Dictionary,
hanziDict: Dictionary,
lineNumber: number
): Promise<void> {
const { traditional, simplified, pinyin, definitionArray } = parseLine(line);
const {
traditional,
simplified,
pinyin,
zhuyin,
pinyinDefinitionArray,
zhuyinDefinitionArray,
} = parseLine(line);

await addTermEntry(
termDict,
pinyinDict,
traditional,
simplified,
pinyin,
definitionArray,
pinyinDefinitionArray,
lineNumber
);

await addTermEntry(
zhuyinDict,
traditional,
simplified,
zhuyin,
zhuyinDefinitionArray,
lineNumber
);

Expand All @@ -24,7 +41,7 @@ export async function processLine(
traditional,
simplified,
pinyin,
definitionArray
pinyinDefinitionArray
);
}

Expand Down Expand Up @@ -57,12 +74,12 @@ async function addTermEntry(
termDict: Dictionary,
traditional: string,
simplified: string,
pinyin: string,
reading: string,
definitionArray: string[],
sequenceNumber: number
): Promise<void> {
const termEntry = new TermEntry(traditional)
.setReading(pinyin)
.setReading(reading)
.setSequenceNumber(sequenceNumber);

// Build definition
Expand Down
71 changes: 45 additions & 26 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import {
HANZI_ZIP_NAME,
TERM_INDEX_NAME,
HANZI_INDEX_NAME,
ZHUYIN_ZIP_NAME,
ZHUYIN_INDEX_NAME,
} from './config';
import { processLine } from './dictionaryUtils';
import { parseComments } from './fileUtils';
Expand All @@ -31,13 +33,14 @@ async function main() {
const { creationDateClean } = parseComments(lines);
console.log(`Creation date: ${creationDateClean}`);

const termDict = new Dictionary({ fileName: TERM_ZIP_NAME });
const pinyinDict = new Dictionary({ fileName: TERM_ZIP_NAME });
const zhuyinDict = new Dictionary({ fileName: ZHUYIN_ZIP_NAME });
const hanziDict = new Dictionary({ fileName: HANZI_ZIP_NAME });

// Parse entries
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
await processLine(line, termDict, hanziDict, i);
await processLine(line, pinyinDict, zhuyinDict, hanziDict, i);
if (i % 1000 === 0) {
console.log(`Processed ${i} lines...`);
}
Expand All @@ -48,38 +51,54 @@ async function main() {

const index = createDictionaryIndex(creationDateClean);

// Export term dict
index.setIndexUrl(
`https://github.com/MarvNC/cc-cedict-yomitan/releases/latest/download/${TERM_INDEX_NAME}`
// Export dictionaries
await exportDictionary(
index,
pinyinDict,
TERM_INDEX_NAME,
TERM_ZIP_NAME,
BUILD_DIR
);
index.setDownloadUrl(
`https://github.com/MarvNC/cc-cedict-yomitan/releases/latest/download/${TERM_ZIP_NAME}`
await exportDictionary(
index,
zhuyinDict,
ZHUYIN_INDEX_NAME,
ZHUYIN_ZIP_NAME,
BUILD_DIR,
`CC-CEDICT Zhuyin [${creationDateClean}]`
);
await Bun.write(
join(BUILD_DIR, TERM_INDEX_NAME),
JSON.stringify(index.build())
await exportDictionary(
index,
hanziDict,
HANZI_INDEX_NAME,
HANZI_ZIP_NAME,
BUILD_DIR,
`CC-CEDICT Hanzi [${creationDateClean}]`
);
await termDict.setIndex(index.build());
const termDictStats = await termDict.export(BUILD_DIR);
console.log(`Exported ${termDictStats.termCount} terms.`);
console.log(`Wrote ${TERM_ZIP_NAME} to ${BUILD_DIR}.`);
}

// Export hanzi dict
/**
 * Points the shared index at one dictionary's release files, writes the
 * index JSON into the build directory, and exports the dictionary archive.
 *
 * Everything the function needs arrives through its parameters; it does not
 * read the per-dictionary constants or any caller-local variable directly.
 *
 * @param index - Index builder shared between exports; its URLs (and title,
 *   when given) are overwritten on every call.
 * @param dictionary - Dictionary to attach the built index to and export.
 * @param indexName - File name of the index JSON, also used in the index URL.
 * @param zipName - Archive file name, used in the download URL and the log.
 * @param buildDir - Directory the index JSON and the archive are written to.
 * @param title - Optional title override; when omitted (or empty) the title
 *   already set on `index` is left as is.
 */
async function exportDictionary(
  index: DictionaryIndex,
  dictionary: Dictionary,
  indexName: string,
  zipName: string,
  buildDir: string,
  title?: string
): Promise<void> {
  index.setIndexUrl(
    `https://github.com/MarvNC/cc-cedict-yomitan/releases/latest/download/${indexName}`
  );
  index.setDownloadUrl(
    `https://github.com/MarvNC/cc-cedict-yomitan/releases/latest/download/${zipName}`
  );
  if (title) {
    index.setTitle(title);
  }

  // The standalone index file and the index embedded in the archive are both
  // built after the URLs/title above have been applied.
  await Bun.write(join(buildDir, indexName), JSON.stringify(index.build()));
  await dictionary.setIndex(index.build());

  const dictStats = await dictionary.export(buildDir);
  // `||` is deliberate: a term count of 0 falls through to the kanji count.
  console.log(`Exported ${dictStats.termCount || dictStats.kanjiCount} items.`);
  console.log(`Wrote ${zipName} to ${buildDir}.`);
}

function createDictionaryIndex(creationDateClean: string): DictionaryIndex {
Expand Down
41 changes: 30 additions & 11 deletions src/parseLine.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import { normalizePinyin, replacePinyinNumbers } from './pinyinUtils';
import { getPinyin, getZhuyin, replacePinyinNumbers } from './pinyinUtils';
import type { ParsedLine } from './types';

export function parseLine(line: string): ParsedLine {
const lineArr = line.split('');
let traditional = '';
let simplified = '';
let pinyin = '';
let pinyinNumbers = '';
let english = '';

while (lineArr[0] !== ' ') {
Expand All @@ -23,7 +23,7 @@ export function parseLine(line: string): ParsedLine {
}
lineArr.shift(); // [
while (lineArr[0] !== ']') {
pinyin += lineArr.shift();
pinyinNumbers += lineArr.shift();
}
lineArr.shift(); // ]
if (lineArr[0] !== ' ') {
Expand All @@ -33,21 +33,40 @@ export function parseLine(line: string): ParsedLine {
english = lineArr.join('');

// Process
// Convert pinyin to tone
pinyin = normalizePinyin(pinyin);
// Convert pinyin
const pinyin = getPinyin(pinyinNumbers);

// Remove spaces
pinyin = pinyin.replace(/ /g, '');
// Zhuyin
const zhuyin = getZhuyin(pinyinNumbers);

// Convert number pinyin in definition to tone
english = replacePinyinNumbers(english);

const definitionArray = english.split('/').filter((e) => e.trim() !== '');
const { pinyinDefinitionArray, zhuyinDefinitionArray } =
processDefinitionText(english);

return {
traditional,
simplified,
pinyin,
definitionArray,
zhuyin,
pinyinDefinitionArray,
zhuyinDefinitionArray,
};
}

/**
 * Produces the two definition lists for one entry from its raw
 * slash-separated definition text.
 *
 * The text is passed through `replacePinyinNumbers` twice, first with the
 * flag set to `true` and then with `false`. Each result is split on `/`, and
 * segments that are empty or whitespace-only are discarded.
 *
 * @param text - Raw definition text.
 * @returns The list built with the flag `true` as `pinyinDefinitionArray` and
 *   the list built with the flag `false` as `zhuyinDefinitionArray`.
 */
function processDefinitionText(text: string): {
  pinyinDefinitionArray: string[];
  zhuyinDefinitionArray: string[];
} {
  const splitDefinitions = (usePinyin: boolean): string[] => {
    const converted = replacePinyinNumbers(text, usePinyin);
    const kept: string[] = [];
    for (const segment of converted.split('/')) {
      if (segment.trim() !== '') {
        kept.push(segment);
      }
    }
    return kept;
  };

  return {
    pinyinDefinitionArray: splitDefinitions(true),
    zhuyinDefinitionArray: splitDefinitions(false),
  };
}
23 changes: 18 additions & 5 deletions src/pinyinUtils.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,34 @@
import pinyinNumbersToTone from 'pinyin-tone';
import zhuyin from 'zhuyin-improved';

export function normalizePinyin(pinyin: string): string {
/**
 * Builds the pinyin reading for a numbered-pinyin string.
 *
 * Steps, in order: `u:` is rewritten to `v` via `replaceUWithV`, the text is
 * lowercased, it is run through the `pinyin-tone` converter, and every space
 * character is removed from the result.
 *
 * @param pinyin - Numbered pinyin as found between the square brackets.
 * @returns The converted reading with no spaces.
 */
export function getPinyin(pinyin: string): string {
  const lowered = replaceUWithV(pinyin).toLowerCase();
  const converted = pinyinNumbersToTone(lowered);
  return converted.replace(/ /g, '');
}

/**
 * Replaces every `u:` in the given string with `v`.
 *
 * This helper performs the substitution only. `getPinyin` and `getZhuyin`
 * each lowercase and convert the returned text themselves, so no tone or
 * case conversion may happen here.
 *
 * @param pinyin - Numbered pinyin text (presumably `u:` is the source data's
 *   ASCII spelling of ü; confirm against the data format).
 * @returns The input with all `u:` occurrences replaced; other characters,
 *   including case and tone digits, are left untouched.
 */
export function replaceUWithV(pinyin: string): string {
  return pinyin.replace(/u:/g, 'v');
}

/**
 * Converts every bracketed numbered-pinyin run inside a definition string.
 *
 * A run is an opening bracket, one or more groups of letters/colons that each
 * end in a tone digit 1-5 (optionally followed by a space), and a closing
 * bracket, e.g. `[pi2 pa5]`.
 *
 * @param string - Definition text that may contain such runs.
 * @param pinyin - `true` converts each run with `getPinyin`; `false` converts
 *   it with `getZhuyin`.
 * @returns The text with the contents of every run converted. The brackets
 *   are kept and text outside the runs is not modified.
 */
export function replacePinyinNumbers(string: string, pinyin: boolean): string {
  const pinyinRegex = /\[(([a-zA-Z\:]+)([1-5]) ?)+\]/g;
  const convert = pinyin ? getPinyin : getZhuyin;

  // Substitute each match where it was found. Searching the whole string for
  // the bare bracket contents could instead hit an identical substring that
  // appears earlier in the definition, outside any brackets.
  return string.replace(pinyinRegex, (match) => {
    // Strip the surrounding brackets before converting, then restore them.
    const pinyinOnly = match.substring(1, match.length - 1);
    return `[${convert(pinyinOnly)}]`;
  });
}

/**
 * Builds the zhuyin reading for a numbered-pinyin string.
 *
 * `u:` is rewritten to `v` via `replaceUWithV`, the text is lowercased and
 * handed to the `zhuyin-improved` converter, and the array it returns is
 * joined with no separator.
 *
 * @param pinyin - Numbered pinyin as found between the square brackets.
 * @returns The concatenated converter output.
 */
export function getZhuyin(pinyin: string): string {
  const lowered = replaceUWithV(pinyin).toLowerCase();
  const parts = zhuyin(lowered);
  return parts.join('');
}
4 changes: 3 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@ export interface ParsedLine {
traditional: string;
simplified: string;
pinyin: string;
definitionArray: string[];
zhuyin: string;
pinyinDefinitionArray: string[];
zhuyinDefinitionArray: string[];
}

0 comments on commit 29cca91

Please sign in to comment.