forked from zolrath/obsidian-auto-link-title
-
Notifications
You must be signed in to change notification settings - Fork 0
/
electron-scraper.ts
133 lines (112 loc) · 3.4 KB
/
electron-scraper.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
const electronPkg = require("electron");
import { request } from "obsidian";
/**
 * Reports whether `text` carries no content.
 *
 * Only `undefined`, `null`, and the empty string count as blank; a string made
 * solely of whitespace does not.
 *
 * @param text - Value to inspect.
 * @returns `true` when `text` is `undefined`, `null`, or `""`.
 */
function blank(text: string): boolean {
  if (text === undefined || text === null) {
    return true;
  }
  return text === "";
}
/**
 * Logical inverse of `blank`.
 *
 * @param text - Value to inspect.
 * @returns `true` when `blank(text)` is `false`.
 */
function notBlank(text: string): boolean {
  const isEmpty = blank(text);
  return !isEmpty;
}
/**
 * Loads `url` in the given window and settles when the page finishes loading
 * or the load fails.
 *
 * Both event handlers are detached as soon as the promise settles, so nothing
 * stays registered on `window.webContents` afterwards.
 *
 * @param window - Object exposing `webContents` (an event emitter) and `loadURL(url)`.
 * @param url - Address to load.
 * @returns A promise that resolves on `did-finish-load`.
 * @throws Rejects with an `Error` when `did-fail-load` fires for anything other
 *   than a sub-frame, or when `loadURL` itself throws or rejects.
 */
async function load(window: any, url: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const contents = window.webContents;

    // Remove both handlers; safe to call more than once.
    function detach(): void {
      contents.removeListener("did-finish-load", onFinish);
      contents.removeListener("did-fail-load", onFail);
    }

    function onFinish(): void {
      detach();
      resolve();
    }

    function onFail(
      _event: any,
      errorCode?: number,
      errorDescription?: string,
      _validatedURL?: string,
      isMainFrame?: boolean
    ): void {
      // A failing sub-frame (ad, tracker, blocked iframe) says nothing about
      // the page itself, so keep waiting for the main frame.
      if (isMainFrame === false) {
        return;
      }
      detach();
      reject(
        new Error(
          `Failed to load ${url}: ${errorDescription ?? "unknown error"} (${errorCode ?? "?"})`
        )
      );
    }

    contents.on("did-finish-load", onFinish);
    contents.on("did-fail-load", onFail);

    try {
      // loadURL may return a promise; observe it so a rejection is never left
      // unhandled and so a load that stops without emitting either event still
      // settles this promise. Once settled, further resolve/reject calls are no-ops.
      Promise.resolve(window.loadURL(url)).catch((err: unknown) => {
        detach();
        reject(err);
      });
    } catch (err) {
      detach();
      reject(err);
    }
  });
}
/**
 * Resolves a page title by loading `url` in a hidden Electron `BrowserWindow`
 * (images disabled, audio muted) and reading the title from its web contents.
 *
 * The window is always destroyed before returning, including when the load
 * fails, so no hidden window is left behind.
 *
 * @param url - Address of the page to load.
 * @returns The page title when it is non-blank; `url` when the title is blank
 *   or cannot be read; "Site Unreachable" when the window cannot be created or
 *   the page fails to load.
 */
async function electronGetPageTitle(url: string): Promise<string> {
  const { remote } = electronPkg;
  const { BrowserWindow } = remote;

  let window: any = null;
  try {
    window = new BrowserWindow({
      width: 1000,
      height: 600,
      webPreferences: {
        // NOTE(review): webSecurity is left disabled as in the original code;
        // confirm whether it is still required just to read a page title.
        webSecurity: false,
        // The loaded page is arbitrary remote content, so it must not get
        // access to Node.js APIs. Reading the title does not need it.
        nodeIntegration: false,
        images: false,
      },
      show: false,
    });
    window.webContents.setAudioMuted(true);

    await load(window, url);

    let title: string;
    try {
      title = window.webContents.getTitle();
    } catch (ex) {
      // The page loaded but its title could not be read: fall back to the URL.
      return url;
    }
    return notBlank(title) ? title : url;
  } catch (ex) {
    console.error(ex);
    return "Site Unreachable";
  } finally {
    // Runs on every path above; cleanup is best-effort and must not replace
    // the value being returned.
    if (window != null) {
      try {
        window.destroy();
      } catch (ex) {
        console.error(ex);
      }
    }
  }
}
/**
 * Resolves a page title without rendering the page: the HTML is fetched with
 * Obsidian's `request` and parsed with `DOMParser`.
 *
 * @param url - Address of the page to fetch.
 * @returns The text of the first `<title>` element when it is non-blank;
 *   otherwise that element's `no-title` attribute when it is non-blank;
 *   otherwise `url`. Returns "Site Unreachable" when fetching or parsing throws.
 */
async function nonElectronGetPageTitle(url: string): Promise<string> {
  try {
    const markup = await request({ url });
    const parsed = new DOMParser().parseFromString(markup, "text/html");
    const titleEl = parsed.querySelectorAll("title")[0];

    // Common case: the document has a usable <title>.
    if (titleEl != null && !blank(titleEl.innerText)) {
      return titleEl.innerText;
    }

    // JavaScript-driven sites may expose a placeholder through a `no-title`
    // attribute before the page has run; use it when present, else the URL.
    const placeholder = titleEl?.getAttr("no-title");
    return notBlank(placeholder) ? placeholder : url;
  } catch (ex) {
    console.error(ex);
    return "Site Unreachable";
  }
}
/**
 * Extracts the last path segment of `url` for use as a human-readable name.
 *
 * A single trailing slash is ignored, and percent-encoding is decoded so that
 * `my%20file.pdf` becomes `my file.pdf`. If the segment is not valid
 * percent-encoding it is returned as-is.
 *
 * @param url - Absolute URL to inspect.
 * @returns The decoded final segment, `""` when the path has no segment
 *   (for example `https://example.com/`), or `"File"` when `url` cannot be parsed.
 */
function getUrlFinalSegment(url: string): string {
  try {
    const segments = new URL(url).pathname.split("/");
    // The second pop skips the empty entry produced by a trailing slash; the
    // final fallback keeps the declared `string` return type honest.
    const last = segments.pop() || segments.pop() || "";
    try {
      return decodeURIComponent(last);
    } catch (_) {
      // Malformed escape sequence: the raw segment is still a usable name.
      return last;
    }
  } catch (_) {
    return "File";
  }
}
/**
 * Probes `url` with a HEAD request to decide whether it is worth scraping.
 *
 * @param url - Absolute URL to probe.
 * @returns
 *   - "Site Unreachable" when the response status is not ok;
 *   - the URL's final path segment when the response declares a content type
 *     other than HTML (a PDF, a video, ...);
 *   - `null` when the page should be scraped: the content type is HTML, the
 *     header is absent, or the request itself failed.
 */
async function tryGetFileType(url: string): Promise<string | null> {
  try {
    const response = await fetch(url, { method: "HEAD" });
    // Ensure the site returns an ok status code before scraping.
    if (!response.ok) {
      return "Site Unreachable";
    }

    const contentType = response.headers.get("content-type");
    // No declared type: nothing to rule the page out, so let the caller scrape it.
    if (contentType == null) {
      return null;
    }

    // Media types are case-insensitive, so normalise before comparing.
    if (!contentType.toLowerCase().includes("text/html")) {
      return getUrlFinalSegment(url);
    }
    return null;
  } catch (err) {
    // The request could not be made; returning null lets the caller try to
    // scrape the page anyway.
    return null;
  }
}
/**
 * Resolves a display title for `url`.
 *
 * @param url - Address to look up; `https://` is prepended when it does not
 *   already start with `http://` or `https://`.
 * @returns The value reported by `tryGetFileType` when it is non-empty;
 *   otherwise the title produced by the Electron scraper when the `electron`
 *   package is available, or by the request-based scraper when it is not.
 */
export default async function getPageTitle(url: string): Promise<string> {
  // Match the full scheme, case-insensitively. A bare prefix test on "http"
  // would wrongly treat hosts such as "httpbin.org" as already having a scheme.
  const hasHttpScheme = /^https?:\/\//i.test(url);
  const target = hasHttpScheme ? url : "https://" + url;

  // Try a HEAD request to see if the site is reachable and if it's an HTML page.
  // If that request errors out (for example due to CORS) the result is empty
  // and we try to scrape the page anyway.
  const fileType = await tryGetFileType(target);
  if (fileType) {
    return fileType;
  }

  // If we're on Desktop use the Electron scraper.
  if (electronPkg != null) {
    return electronGetPageTitle(target);
  }
  return nonElectronGetPageTitle(target);
}