Skip to content

Commit 17e9894

Browse files
committed
improve-url-extraction
1 parent 9cb26c7 commit 17e9894

File tree

2 files changed

+21
-2
lines changed

2 files changed

+21
-2
lines changed

src/index.test.ts

+5-1
Original file line numberDiff line numberDiff line change
@@ -251,9 +251,13 @@ hahev11@walla.com
251251
252252
רוצה לדעת לפני כולם מה קורה?
253253
להצטרפות לקבוצות של מה קורה היום? >> makorehayom.info/48YR0Sw`,
254-
["http://shiri-livny.com/adamaretreat", "http://makorehayom.info/48YR0Sw"],
254+
["Shiri-livny.com/adamaretreat", "makorehayom.info/48YR0Sw"],
255255
],
256256
["", []],
257+
[
258+
"https://gigtickets.co.il/il//eid/101",
259+
["https://gigtickets.co.il/il//eid/101"],
260+
],
257261
]);
258262

259263
testUnaryFn("telegram handlers", telegramHandlesInText)([

src/index.ts

+16-1
Original file line numberDiff line numberDiff line change
@@ -393,9 +393,24 @@ const removeEmails = replace(
393393
"",
394394
);
395395

396+
/**
 * Extracts URL-like substrings from free text.
 *
 * A candidate is an optional `http://`, `https://` or `ftp://` scheme, a host
 * made of at least two dot-separated labels, and an optional run of
 * path/query/fragment characters. Bare domains such as `example.com/page`
 * are accepted without a scheme.
 *
 * @param text - Text to scan; may be empty.
 * @returns Candidates in order of appearance (duplicates are kept), with
 *   trailing sentence punctuation removed and version-number-like matches
 *   dropped.
 */
const extractUrls = (text: string): string[] => {
  // Host labels are word characters or hyphens only. Dots are allowed solely
  // as single separators, so an ellipsis ("wait...what") is not a host.
  const urlRegex =
    /(?:(?:https?|ftp):\/\/)?[\w-]+(?:\.[\w-]+)+[\w\-._~:/?#[\]@!$&'()*+,;=]*/g;
  // The tail character class above also accepts sentence punctuation, so a
  // URL at the end of a clause would otherwise keep its trailing "." or ",".
  const trailingPunctuation = /[.,;:!?']+$/;
  // Scheme-less matches that start with "<digits>.<digits>" look like version
  // numbers or decimals (e.g. "1.2.3", "3.50") rather than links.
  const versionLike = /^\d+\.\d+/;

  return (text.match(urlRegex) ?? [])
    .map((candidate) => candidate.replace(trailingPunctuation, ""))
    .filter((candidate) => !versionLike.test(candidate));
};
410+
396411
/**
 * Returns the URL-like substrings found in `x`.
 *
 * The text is first passed through `removeEmails` (presumably so e-mail
 * domains are not reported as links; confirm against its definition), then
 * whitespace-delimited tokens ending in `.jpg`, `.png` or `.jpeg` are
 * removed, and the remainder is scanned by `extractUrls`.
 *
 * @param x - Raw text to scan.
 * @returns The URLs found by `extractUrls` in the cleaned text.
 */
export const urlsInText = (
  x: string,
  // The dot before the extension is escaped so that only real image file
  // names are stripped, not any token that merely ends in "jpg"/"png"/"jpeg".
) => extractUrls(removeEmails(x).replace(/\b\S+\.(jpg|png|jpeg)\b/g, ""));
399414

400415
type Keywords = { keywords: string[]; antiKeywords?: string[] };
401416

0 commit comments

Comments
 (0)