Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core(fetcher): remove iframe fetcher #13923

Merged
merged 15 commits into from
May 5, 2022
2 changes: 1 addition & 1 deletion lighthouse-core/fraggle-rock/gather/driver.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class Driver {
const session = await this._page.target().createCDPSession();
this._session = this.defaultSession = new ProtocolSession(session);
this._executionContext = new ExecutionContext(this._session);
this._fetcher = new Fetcher(this._session, this._executionContext);
this._fetcher = new Fetcher(this._session);
log.timeEnd(status);
}

Expand Down
2 changes: 1 addition & 1 deletion lighthouse-core/gather/driver.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class Driver {
defaultSession = this;

// eslint-disable-next-line no-invalid-this
fetcher = new Fetcher(this.defaultSession, this.executionContext);
fetcher = new Fetcher(this.defaultSession);

/**
* @param {Connection} connection
Expand Down
245 changes: 35 additions & 210 deletions lighthouse-core/gather/fetcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,124 +6,56 @@
'use strict';

/**
* @fileoverview Fetcher is a utility for making requests within the context of the page.
* Requests can circumvent CORS, and so are good for fetching source maps that may be hosted
* on a different origin.
* @fileoverview Fetcher is a utility for making requests to any arbitrary resource,
* ignoring normal browser constraints such as CORS.
*/

/* global document */
/* global fetch */

/** @typedef {{content: string|null, status: number|null}} FetchResponse */

const log = require('lighthouse-logger');
const {getBrowserVersion} = require('./driver/environment.js');

class Fetcher {
Copy link
Member

@adamraine adamraine Apr 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to use a class anymore, the state is just whatever session we pass into the constructor. We could refactor this to be more function like the rest of our helper files in lighthouse-core/gather/driver. This way we don't need to pass around a fetcher object on the gather context.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should do this in a different PR.

/**
* @param {LH.Gatherer.FRProtocolSession} session
* @param {import('./driver/execution-context.js')} executionContext
*/
constructor(session, executionContext) {
constructor(session) {
this.session = session;
this.executionContext = executionContext;
/** @type {Map<string, (event: LH.Crdp.Fetch.RequestPausedEvent) => void>} */
this._onRequestPausedHandlers = new Map();
this._onRequestPaused = this._onRequestPaused.bind(this);
this._enabled = false;
}

/**
* Chrome M92 and above:
* We use `Network.loadNetworkResource` to fetch each resource.
*
* Chrome <M92:
* The Fetch domain accepts patterns for controlling what requests are intercepted, but we
* enable the domain for all patterns and filter events at a lower level to support multiple
* concurrent usages. Reasons for this:
*
* 1) only one set of patterns may be applied for the entire domain.
* 2) every request that matches the patterns are paused and only resumes when certain Fetch
* commands are sent. So a listener of the `Fetch.requestPaused` event must either handle
* the requests it cares about, or explicitly allow them to continue.
* 3) if multiple commands to continue the same request are sent, protocol errors occur.
* Fetches any resource using the network directly.
*
* So instead we have one global `Fetch.enable` / `Fetch.requestPaused` pair, and allow specific
* urls to be intercepted via `fetcher._setOnRequestPausedHandler`.
*/
async enable() {
if (this._enabled) return;

this._enabled = true;
await this.session.sendCommand('Fetch.enable', {
patterns: [{requestStage: 'Request'}, {requestStage: 'Response'}],
});
await this.session.on('Fetch.requestPaused', this._onRequestPaused);
}

async disable() {
if (!this._enabled) return;

this._enabled = false;
await this.session.off('Fetch.requestPaused', this._onRequestPaused);
await this.session.sendCommand('Fetch.disable');
this._onRequestPausedHandlers.clear();
}

/**
* @param {string} url
* @param {(event: LH.Crdp.Fetch.RequestPausedEvent) => void} handler
*/
async _setOnRequestPausedHandler(url, handler) {
this._onRequestPausedHandlers.set(url, handler);
}

/**
* @param {LH.Crdp.Fetch.RequestPausedEvent} event
* @param {{timeout: number}=} options timeout is in ms
* @return {Promise<FetchResponse>}
*/
_onRequestPaused(event) {
const handler = this._onRequestPausedHandlers.get(event.request.url);
if (handler) {
handler(event);
} else {
// Nothing cares about this URL, so continue.
this.session.sendCommand('Fetch.continueRequest', {requestId: event.requestId}).catch(err => {
log.error('Fetcher', `Failed to continueRequest: ${err.message}`);
});
async fetchResource(url, options = {timeout: 2_000}) {
// In Lightrider, `Network.loadNetworkResource` is not implemented, but fetch
// is configured to work for any resource.
if (global.isLightrider) {
return this._wrapWithTimeout(this._fetchWithFetchApi(url), options.timeout);
}
}

/**
* `Network.loadNetworkResource` was introduced in M88.
* The long timeout bug with `IO.read` was fixed in M92:
* https://bugs.chromium.org/p/chromium/issues/detail?id=1191757
* Lightrider has a bug forcing us to use the old version for now:
* https://docs.google.com/document/d/1V-DxgsOFMPxUuFrdGPQpyiCqSljvgNlOqXCtqDtd0b8/edit?usp=sharing&resourcekey=0-aIaIqcHFKG-0dX4MAudBEw
* @return {Promise<boolean>}
*/
async shouldUseLegacyFetcher() {
const {milestone} = await getBrowserVersion(this.session);
return milestone < 92 || Boolean(global.isLightrider);
return this._fetchResourceOverProtocol(url, options);
}

/**
* Requires that `fetcher.enable` has been called.
*
* Fetches any resource in a way that circumvents CORS.
*
* @param {string} url
* @param {{timeout: number}=} options timeout is in ms
* @return {Promise<FetchResponse>}
*/
async fetchResource(url, options = {timeout: 2_000}) {
if (!this._enabled) {
throw new Error('Must call `enable` before using fetchResource');
}
async _fetchWithFetchApi(url) {
// eslint-disable-next-line no-undef
connorjclark marked this conversation as resolved.
Show resolved Hide resolved
const response = await fetch(url);

if (await this.shouldUseLegacyFetcher()) {
return this._fetchResourceIframe(url, options);
}
let content = null;
try {
content = await response.text();
} catch {}

return this._fetchResourceOverProtocol(url, options);
return {
content,
status: response.status,
};
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

iframe fetcher suppressed the response content if the status code was not 2xx. (note: this was not a mistake, even if cached 304 response was received JS fetch sees the cached 2xx response code. and by default redirects are followed and the status code is the final response)

Should we consider suppressing the response content here if the status code is not 2xx (like iframe fetcher did)? And also for Network.loadNetworkResponse?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the browser still work with robots.txt or source maps that have a non-2xx status code?

Copy link
Collaborator Author

@connorjclark connorjclark May 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the status code matters for source maps.

for robots.txt yes it needs to be 2xx https://developers.google.com/search/docs/advanced/robots/robots_txt#http-status-codes and the audit already checks this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should just return anything we get + the status code, and leave it to each use case to check the status if they care. we should be good as it is.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should just return anything we get + the status code, and leave it to each use case to check the status if they care. we should be good as it is.

Yeah SGTM

}

/**
Expand Down Expand Up @@ -172,37 +104,14 @@ class Fetcher {
};
}

/**
* @param {string} requestId
* @return {Promise<string>}
*/
async _resolveResponseBody(requestId) {
const responseBody = await this.session.sendCommand('Fetch.getResponseBody', {requestId});
if (responseBody.base64Encoded) {
return Buffer.from(responseBody.body, 'base64').toString();
}
return responseBody.body;
}

/**
* @param {string} url
* @param {{timeout: number}} options timeout is in ms
* @return {Promise<FetchResponse>}
*/
async _fetchResourceOverProtocol(url, options) {
const startTime = Date.now();

/** @type {NodeJS.Timeout} */
let timeoutHandle;
const timeoutPromise = new Promise((_, reject) => {
timeoutHandle = setTimeout(reject, options.timeout, new Error('Timed out fetching resource'));
});

const responsePromise = this._loadNetworkResource(url);

/** @type {{stream: LH.Crdp.IO.StreamHandle|null, status: number|null}} */
const response = await Promise.race([responsePromise, timeoutPromise])
.finally(() => clearTimeout(timeoutHandle));
const response = await this._wrapWithTimeout(this._loadNetworkResource(url), options.timeout);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_readIOStream has its own weird timeout system, would it be possible to remove that and use _wrapWithTimeout over _fetchResourceOverProtocol?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it wouldn't be the same, unless we made the work _readIOStream does use a cancel-able promise. even if the wrapped timer timed out the work would keep going


const isOk = response.status && response.status >= 200 && response.status <= 299;
if (!response.stream || !isOk) return {status: response.status, content: null};
Expand All @@ -213,105 +122,21 @@ class Fetcher {
}

/**
* Fetches resource by injecting an iframe into the page.
* @param {string} url
* @param {{timeout: number}} options timeout is in ms
* @return {Promise<FetchResponse>}
* @template T
* @param {Promise<T>} promise
* @param {number} ms
*/
async _fetchResourceIframe(url, options) {
/** @type {Promise<FetchResponse>} */
const requestInterceptionPromise = new Promise((resolve, reject) => {
/** @param {LH.Crdp.Fetch.RequestPausedEvent} event */
const handlerAsync = async event => {
const {requestId, responseStatusCode} = event;

// The first requestPaused event is for the request stage. Continue it.
if (!responseStatusCode) {
// Remove cookies so we aren't buying stuff on Amazon.
const headers = Object.entries(event.request.headers)
.filter(([name]) => name !== 'Cookie')
.map(([name, value]) => {
return {name, value};
});

await this.session.sendCommand('Fetch.continueRequest', {
requestId,
headers,
});
return;
}

if (responseStatusCode >= 200 && responseStatusCode <= 299) {
resolve({
status: responseStatusCode,
content: await this._resolveResponseBody(requestId),
});
} else {
resolve({status: responseStatusCode, content: null});
}

// Fail the request (from the page's perspective) so that the iframe never loads.
await this.session.sendCommand('Fetch.failRequest', {requestId, errorReason: 'Aborted'});
};
this._setOnRequestPausedHandler(url, event => handlerAsync(event).catch(reject));
});

/**
* @param {string} src
*/
/* c8 ignore start */
function injectIframe(src) {
const iframe = document.createElement('iframe');
// Try really hard not to affect the page.
iframe.style.display = 'none';
iframe.style.visibility = 'hidden';
iframe.style.position = 'absolute';
iframe.style.top = '-1000px';
iframe.style.left = '-1000px';
iframe.style.width = '1px';
iframe.style.height = '1px';
iframe.src = src;
iframe.onload = iframe.onerror = () => {
iframe.remove();
iframe.onload = null;
iframe.onerror = null;
};
document.body.appendChild(iframe);
}
/* c8 ignore stop */

async _wrapWithTimeout(promise, ms) {
/** @type {NodeJS.Timeout} */
let asyncTimeout;
/** @type {Promise<never>} */
let timeoutHandle;
const timeoutPromise = new Promise((_, reject) => {
asyncTimeout = setTimeout(reject, options.timeout, new Error('Timed out fetching resource.'));
});

const racePromise = Promise.race([
timeoutPromise,
requestInterceptionPromise,
]).finally(() => clearTimeout(asyncTimeout));

// Temporarily disable auto-attaching for this iframe.
await this.session.sendCommand('Target.setAutoAttach', {
autoAttach: false,
waitForDebuggerOnStart: false,
});

const injectionPromise = this.executionContext.evaluate(injectIframe, {
args: [url],
useIsolation: true,
timeoutHandle = setTimeout(reject, ms, new Error('Timed out fetching resource'));
});

const [fetchResult] = await Promise.all([racePromise, injectionPromise]);

await this.session.sendCommand('Target.setAutoAttach', {
flatten: true,
autoAttach: true,
waitForDebuggerOnStart: true,
});

return fetchResult;
/** @type {Promise<T>} */
const wrappedPromise = await Promise.race([promise, timeoutPromise])
.finally(() => clearTimeout(timeoutHandle));
return wrappedPromise;
}
}

Expand Down
11 changes: 0 additions & 11 deletions lighthouse-core/gather/gather-runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,6 @@ class GatherRunner {
const resetStorage = !options.settings.disableStorageReset;
if (resetStorage) await storage.clearDataForOrigin(session, options.requestedUrl);

// Disable fetcher, in case a gatherer enabled it.
// This cleanup should be removed once the only usage of
// fetcher (fetching arbitrary URLs) is replaced by new protocol support.
await driver.fetcher.disable();

await driver.disconnect();
} catch (err) {
// Ignore disconnecting error if browser was already closed.
Expand Down Expand Up @@ -524,12 +519,6 @@ class GatherRunner {
await GatherRunner.populateBaseArtifacts(passContext);
isFirstPass = false;
}

// Disable fetcher for every pass, in case a gatherer enabled it.
// Noop if fetcher was never enabled.
// This cleanup should be removed once the only usage of
// fetcher (fetching arbitrary URLs) is replaced by new protocol support.
await driver.fetcher.disable();
}

await GatherRunner.disposeDriver(driver, options);
Expand Down
29 changes: 0 additions & 29 deletions lighthouse-core/gather/gatherers/seo/robots-txt.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,6 @@

const FRGatherer = require('../../../fraggle-rock/gather/base-gatherer.js');

/* global fetch, location */

/** @return {Promise<LH.Artifacts['RobotsTxt']>} */
/* c8 ignore start */
async function getRobotsTxtContent() {
try {
const response = await fetch(new URL('/robots.txt', location.href).href);
if (!response.ok) {
return {status: response.status, content: null};
}

const content = await response.text();
return {status: response.status, content};
} catch (err) {
return {status: null, content: null, errorMessage: err.message};
}
}
/* c8 ignore stop */

class RobotsTxt extends FRGatherer {
/** @type {LH.Gatherer.GathererMeta} */
meta = {
Expand All @@ -37,18 +18,8 @@ class RobotsTxt extends FRGatherer {
* @return {Promise<LH.Artifacts['RobotsTxt']>}
*/
async getArtifact(passContext) {
// Iframe fetcher still has issues with CSPs.
// Only use the fetcher if we are fetching over the protocol.
if (await passContext.driver.fetcher.shouldUseLegacyFetcher()) {
return passContext.driver.executionContext.evaluate(getRobotsTxtContent, {
args: [],
useIsolation: true,
});
}

const {finalUrl} = passContext.baseArtifacts.URL;
const robotsUrl = new URL('/robots.txt', finalUrl).href;
await passContext.driver.fetcher.enable();
return passContext.driver.fetcher.fetchResource(robotsUrl)
.catch(err => ({status: null, content: null, errorMessage: err.message}));
}
Expand Down
1 change: 0 additions & 1 deletion lighthouse-core/gather/gatherers/source-maps.js
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ class SourceMaps extends FRGatherer {
* @return {Promise<LH.Artifacts['SourceMaps']>}
*/
async getArtifact(context) {
await context.driver.fetcher.enable();
const eventProcessPromises = this._scriptParsedEvents
.map((event) => this._retrieveMapFromScriptParsedEvent(context.driver, event));
return Promise.all(eventProcessPromises);
Expand Down
Loading