Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract Amazon image URLs from the page's embedded JavaScript #86

Merged
merged 15 commits into from
Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .eslintrc.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ module.exports = {
],
'quotes': [
'error',
'single'
'single',
{
avoidEscape: true,
}
],
'semi': [
'error',
Expand Down
14 changes: 14 additions & 0 deletions src/lib/util/json.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
 * Parse JSON text, returning `undefined` instead of throwing when the text
 * is not valid JSON.
 *
 * The parsed value is NOT validated against `T`; the type parameter is only
 * a compile-time claim made by the caller.
 *
 * @param jsonText - Text to parse.
 * @returns The parsed value, or `undefined` if `jsonText` is invalid JSON.
 */
export function safeParseJSON<T>(jsonText: string): T | undefined;
/**
 * Parse JSON text, throwing a descriptive error when the text is not valid
 * JSON.
 *
 * @param jsonText - Text to parse.
 * @param errorMessage - Prefix of the message of the error that is thrown
 *   when parsing fails. Any string, including the empty string, enables
 *   throwing.
 * @returns The parsed value.
 * @throws Error with message `errorMessage`, followed by `': '` and the
 *   stringified parser error, if `jsonText` is invalid JSON.
 */
export function safeParseJSON<T>(jsonText: string, errorMessage: string): T;
export function safeParseJSON<T>(jsonText: string, errorMessage?: string): T | undefined {
    try {
        return JSON.parse(jsonText) as T;
    } catch (err) {
        // Compare against `undefined` rather than relying on truthiness: an
        // empty-string message still selects the throwing overload, which
        // promises never to return `undefined`.
        if (errorMessage === undefined) {
            return undefined;
        }

        // Re-throw with the caller's message, keeping the parser error for
        // context.
        throw new Error(`${errorMessage}: ${String(err)}`);
    }
}
149 changes: 122 additions & 27 deletions src/mb_enhanced_cover_art_uploads/providers/amazon.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
import { qs, qsa, qsMaybe } from '@lib/util/dom';
import { LOGGER } from '@lib/logging/logger';
import { parseDOM, qsa, qsMaybe } from '@lib/util/dom';
import { safeParseJSON } from '@lib/util/json';

import type { CoverArt } from './base';
import { ArtworkTypeIDs, CoverArtProvider } from './base';

const PLACEHOLDER_IMG_REGEX = /01RmK(?:\+|%2B)J4pJL/;

/**
 * One entry of the image list that Amazon embeds in the product page's
 * JavaScript. Incomplete: only the properties we need are declared.
 */
interface AmazonImage {
hiRes: string | null; // URL of the largest version (may be `null`), can still be maximised by IMU
thumb: string; // thumbnail URL; this kind of URL can also be extracted from the sidebar (DOM)
large: string; // maximised version of `thumb`, cannot be further maximised by IMU
variant: string; // Amazon image variant code, see `VARIANT_TYPE_MAPPING` below
}

/**
 * Maps Amazon image variant codes to artwork types. Codes without an entry
 * look up as `undefined`, i.e. the artwork type is unknown.
 */
const VARIANT_TYPE_MAPPING: Record<string, ArtworkTypeIDs | undefined> = {
MAIN: ArtworkTypeIDs.Front,
FRNT: ArtworkTypeIDs.Front, // not seen in use so far, usually MAIN is used for front covers
BACK: ArtworkTypeIDs.Back,
SIDE: ArtworkTypeIDs.Spine, // not seen in use so far
// Deliberately left unmapped for now:
// PT01: ArtworkTypeIDs.Other,
// See https://sellercentral.amazon.com/gp/help/external/JV4FNMT7563SF5F for further details
};

export class AmazonProvider extends CoverArtProvider {
supportedDomains = [
'amazon.ca', 'amazon.cn', 'amazon.de', 'amazon.es', 'amazon.fr',
Expand All @@ -16,41 +35,117 @@ export class AmazonProvider extends CoverArtProvider {
}

name = 'Amazon';
urlRegex = /\/(?:gp\/product|dp)\/([A-Za-z0-9]{10})(?:\/|$)/;
urlRegex = /\/(?:gp\/product|dp)\/([A-Za-z0-9]{10})(?:\/|\?|$)/;

async findImages(url: URL): Promise<CoverArt[]> {
const pageDom = await this.fetchPageDOM(url);
const pageContent = await this.fetchPage(url);
const pageDom = parseDOM(pageContent);

if (qsMaybe('#digitalMusicProductImage_feature_div', pageDom) !== null) {
// Streaming/MP3 product
return this.#extractFromStreamingProduct(pageDom);
// Look for products which only have a single image, the front cover.
const frontCover = this.#extractFrontCover(pageDom);
if (frontCover) {
return [frontCover];
}

// Thumbnails in the sidebar, IMU will maximise
const imgs = qsa<HTMLImageElement>('#altImages img', pageDom);
const covers: CoverArt[] = imgs
// Filter out placeholder images.
.filter((img) => !PLACEHOLDER_IMG_REGEX.test(img.src))
.map((img) => {
return { url: new URL(img.src) };
});
// For physical products we have to extract the embedded JS from the
// page source to get all images in their highest available resolution.
let covers = this.extractFromEmbeddedJS(pageContent);
if (!covers) {
// Use the (smaller) image thumbnails in the sidebar as a fallback,
// although it might not contain all of them. IMU will maximise,
// but the results are still inferior to the embedded hires images.
covers = this.extractFromThumbnailSidebar(pageDom);
}
if (!covers.length) {
// Handle physical audiobooks, the above extractors fail for those.
LOGGER.warn('Found no release images, trying to find an Amazon (audio)book gallery…');
covers = this.extractFromEmbeddedJSGallery(pageContent) ?? /* istanbul ignore next: Should never happen */[];
}

// Filter out placeholder images.
return covers.filter((img) => !PLACEHOLDER_IMG_REGEX.test(img.url.href));
}

/**
 * Look for the single product image of pages that only ever show a front
 * cover.
 *
 * @param pageDom - Parsed product page.
 * @returns The front cover, or `undefined` when none of the known selectors
 *   matches (different product type, or no image found).
 */
#extractFrontCover(pageDom: Document): CoverArt | undefined {
    // Tried in order; the first selector that matches an image wins.
    const candidateSelectors = [
        '#digitalMusicProductImage_feature_div > img', // Streaming/MP3 products
        'img#main-image', // Audible products
    ];

    for (const candidate of candidateSelectors) {
        const match = qsMaybe<HTMLImageElement>(candidate, pageDom);
        if (!match) {
            continue;
        }

        // This is only the thumbnail URL, IMU will maximise it.
        const url = new URL(match.src);
        return { url, types: [ArtworkTypeIDs.Front] };
    }

    return undefined;
}

/**
 * Extract the images from the `colorImages` data that Amazon embeds in the
 * page's JavaScript.
 *
 * @param pageContent - Raw source text of the product page.
 * @returns One cover per embedded image, or `undefined` when the embedded
 *   data cannot be located or is not a JSON array.
 */
extractFromEmbeddedJS(pageContent: string): CoverArt[] | undefined {
    const rawImages = /^'colorImages': { 'initial': (.+)},$/m.exec(pageContent)?.[1];
    if (!rawImages) {
        LOGGER.warn('Failed to extract Amazon images from the embedded JS, falling back to thumbnails');
        return undefined;
    }

    const parsedImages = safeParseJSON<AmazonImage[]>(rawImages);
    if (!Array.isArray(parsedImages)) {
        LOGGER.error("Failed to parse Amazon's embedded JS, falling back to thumbnails");
        return undefined;
    }

    const covers: CoverArt[] = [];
    for (const { hiRes, large, variant } of parsedImages) {
        // `hiRes` is probably only `null` when `large` is the placeholder image?
        covers.push(this.#convertVariant({ url: hiRes ?? large, variant }));
    }
    return covers;
}

extractFromEmbeddedJSGallery(pageContent: string): CoverArt[] | undefined {
const embeddedGallery = pageContent.match(/^'imageGalleryData' : (.+),$/m)?.[1];
if (!embeddedGallery) {
LOGGER.warn('Failed to extract Amazon images from the embedded JS (audio)book gallery');
return;
}

// We don't know anything about the types of these images, but we can
// probably assume the first image is the front cover.
if (covers.length) {
covers[0].types = [ArtworkTypeIDs.Front];
const imgs = safeParseJSON<Array<{ mainUrl: string }>>(embeddedGallery);
if (!Array.isArray(imgs)) {
LOGGER.error("Failed to parse Amazon's embedded JS (audio)book gallery");
return;
}

return covers;
// Amazon embeds no image variants on these pages, so we don't know the types
return imgs.map((img) => ({ url: new URL(img.mainUrl) }));
}

#extractFromStreamingProduct(doc: Document): CoverArt[] {
const img = qs<HTMLImageElement>('#digitalMusicProductImage_feature_div > img', doc);
// For MP3/Streaming releases, we know the cover is the front one.
// Only returning the thumbnail, IMU will maximise
return [{
url: new URL(img.src),
types: [ArtworkTypeIDs.Front],
}];
/**
 * Collect the thumbnail images matched by `#altImages img`.
 *
 * @param pageDom - Parsed product page.
 * @returns One cover per thumbnail, typed when its variant code is known.
 */
extractFromThumbnailSidebar(pageDom: Document): CoverArt[] {
    const covers: CoverArt[] = [];

    for (const thumbnail of qsa<HTMLImageElement>('#altImages img', pageDom)) {
        // The variant code is read from a JSON-encoded attribute on the
        // closest matching span.
        const wrapper = thumbnail.closest('span[data-thumb-action]');
        const thumbAction = wrapper?.getAttribute('data-thumb-action');
        const variant = thumbAction && safeParseJSON<{ variant: string }>(thumbAction)?.variant;

        /* istanbul ignore if: Difficult to exercise */
        if (!variant) {
            LOGGER.warn('Failed to extract the Amazon image variant code from the JSON attribute');
        }

        covers.push(this.#convertVariant({ url: thumbnail.src, variant }));
    }

    return covers;
}

/**
 * Turn an image URL and its Amazon variant code into a cover.
 *
 * @param cover - Image URL and, if known, its variant code.
 * @returns The cover, with `types` set only when the variant code has an
 *   entry in `VARIANT_TYPE_MAPPING`.
 */
#convertVariant(cover: { url: string; variant?: string | null }): CoverArt {
    const { url: rawUrl, variant } = cover;
    const url = new URL(rawUrl);
    LOGGER.debug(`${url.href} has the Amazon image variant code '${variant}'`);

    // A missing or unmapped variant code leaves the cover untyped.
    const type = variant ? VARIANT_TYPE_MAPPING[variant] : undefined;
    return type ? { url, types: [type] } : { url };
}
}
6 changes: 3 additions & 3 deletions src/mb_enhanced_cover_art_uploads/providers/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ export abstract class CoverArtProvider {
return !!id && id === this.extractId(redirectedUrl);
}

async fetchPageDOM(url: URL): Promise<Document> {
async fetchPage(url: URL): Promise<string> {
const resp = await gmxhr(url);
if (resp.finalUrl !== url.href && !this.isSafeRedirect(url, new URL(resp.finalUrl))) {
throw new Error(`Refusing to extract images from ${this.name} provider because the original URL redirected to ${resp.finalUrl}, which may be a different release. If this redirected URL is correct, please retry with ${resp.finalUrl} directly.`);
}

return parseDOM(resp.responseText);
return resp.responseText;
}
}

Expand Down Expand Up @@ -113,7 +113,7 @@ export abstract class HeadMetaPropertyProvider extends CoverArtProvider {
async findImages(url: URL): Promise<CoverArt[]> {
// Find an image link from a HTML head meta property, maxurl will
// maximize it for us. Don't want to use the API because of OAuth.
const respDocument = await this.fetchPageDOM(url);
const respDocument = parseDOM(await this.fetchPage(url));
const coverElmt = qs<HTMLMetaElement>('head > meta[property="og:image"]', respDocument);
return [{
url: new URL(coverElmt.content),
Expand Down
3 changes: 2 additions & 1 deletion src/mb_enhanced_cover_art_uploads/providers/discogs.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { assert, assertHasValue } from '@lib/util/assert';
import { safeParseJSON } from '@lib/util/json';
import { gmxhr } from '@lib/util/xhr';
import type { CoverArt } from './base';
import { CoverArtProvider } from './base';
Expand Down Expand Up @@ -95,7 +96,7 @@ export class DiscogsProvider extends CoverArtProvider {
}));
const resp = await gmxhr(`https://www.discogs.com/internal/release-page/api/graphql?operationName=ReleaseAllImages&variables=${variables}&extensions=${extensions}`);

const metadata = JSON.parse(resp.responseText) as DiscogsImages;
const metadata = safeParseJSON<DiscogsImages>(resp.responseText, 'Invalid response from Discogs API');
assertHasValue(metadata.data.release, 'Discogs release does not exist');
const responseId = metadata.data.release.discogsId.toString();
assert(typeof responseId === 'undefined' || responseId === releaseId, `Discogs returned wrong release: Requested ${releaseId}, got ${responseId}`);
Expand Down
3 changes: 2 additions & 1 deletion src/mb_enhanced_cover_art_uploads/providers/qobuz.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { assert, assertHasValue } from '@lib/util/assert';
import { safeParseJSON } from '@lib/util/json';
import { gmxhr, HTTPResponseError } from '@lib/util/xhr';

import type { CoverArt } from './base';
Expand Down Expand Up @@ -65,7 +66,7 @@ export class QobuzProvider extends CoverArtProvider {
},
});

const metadata = JSON.parse(resp.responseText) as AlbumMetadata;
const metadata = safeParseJSON<AlbumMetadata>(resp.responseText, 'Invalid response from Qobuz API');
assert(metadata.id.toString() === id, `Qobuz returned wrong release: Requested ${id}, got ${metadata.id}`);

return metadata;
Expand Down
24 changes: 19 additions & 5 deletions src/mb_enhanced_cover_art_uploads/providers/tidal.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { assert, assertHasValue } from '@lib/util/assert';
import { safeParseJSON } from '@lib/util/json';
import { gmxhr } from '@lib/util/xhr';

import type { CoverArt } from './base';
Expand All @@ -13,6 +14,18 @@ import { ArtworkTypeIDs, CoverArtProvider } from './base';
// https://web.archive.org/web/20181015184006/https://listen.tidal.com/app.9dbb572e8121f8755b73.js
const APP_ID = 'CzET4vdadNUFQ5JU';

/**
 * Shape assumed for the album metadata response of the Tidal API.
 * Incomplete and not entirely correct, but good enough for our purposes.
 * NOTE(review): the code reading this still guards `modules` and `album`
 * with optional chaining, so those levels may be absent at runtime.
 */
interface AlbumMetadata {
rows: Array<{
modules: Array<{
album: {
// Compared (stringified) against the requested album ID.
id: number;
// Cover identifier; its dashes are replaced by slashes to build the image URL.
cover: string;
};
}>;
}>;
}

export class TidalProvider extends CoverArtProvider {
supportedDomains = ['tidal.com', 'listen.tidal.com', 'store.tidal.com'];
favicon = 'https://listen.tidal.com/favicon.ico';
Expand All @@ -28,7 +41,8 @@ export class TidalProvider extends CoverArtProvider {
'x-tidal-token': APP_ID,
},
});
this.#countryCode = JSON.parse(resp.responseText).countryCode;
const codeResponse = safeParseJSON<{ countryCode: string }>(resp.responseText, 'Invalid JSON response from Tidal API for country code');
this.#countryCode = codeResponse.countryCode;
}
assertHasValue(this.#countryCode, 'Cannot determine Tidal country');
return this.#countryCode;
Expand All @@ -45,12 +59,12 @@ export class TidalProvider extends CoverArtProvider {
'x-tidal-token': APP_ID,
},
});
const metadata = JSON.parse(resp.responseText);
const albumMetadata = metadata?.rows?.[0]?.modules?.[0]?.album;
const metadata = safeParseJSON<AlbumMetadata>(resp.responseText, 'Invalid response from Tidal API');
const albumMetadata = metadata.rows[0]?.modules?.[0]?.album;
assertHasValue(albumMetadata, 'Tidal API returned no album, 404?');
assert(albumMetadata.id?.toString() === albumId, `Tidal returned wrong release: Requested ${albumId}, got ${albumMetadata.id}`);
assert(albumMetadata.id.toString() === albumId, `Tidal returned wrong release: Requested ${albumId}, got ${albumMetadata.id}`);

const coverId = metadata?.rows?.[0]?.modules?.[0]?.album?.cover;
const coverId = albumMetadata.cover;
assertHasValue(coverId, 'Could not find cover in Tidal metadata');
return `https://resources.tidal.com/images/${coverId.replace(/-/g, '/')}/origin.jpg`;
}
Expand Down
3 changes: 2 additions & 1 deletion src/mb_enhanced_cover_art_uploads/providers/vgmdb.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { assert, assertHasValue } from '@lib/util/assert';
import { safeParseJSON } from '@lib/util/json';
import { gmxhr } from '@lib/util/xhr';

import type { CoverArt } from './base';
Expand Down Expand Up @@ -133,7 +134,7 @@ export class VGMdbProvider extends CoverArtProvider {
assertHasValue(id);
const apiUrl = `https://vgmdb.info/album/${id}?format=json`;
const apiResp = await gmxhr(apiUrl);
const metadata = JSON.parse(apiResp.responseText) as AlbumMetadata;
const metadata = safeParseJSON<AlbumMetadata>(apiResp.responseText, 'Invalid JSON response from vgmdb.info API');

assert(metadata.link === 'album/' + id, `VGMdb.info returned wrong release: Requested album/${id}, got ${metadata.link}`);

Expand Down
5 changes: 3 additions & 2 deletions src/mb_enhanced_cover_art_uploads/seeding/parameters.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { LOGGER } from '@lib/logging/logger';
import { safeParseJSON } from '@lib/util/json';
import type { CoverArt } from '../providers/base';

function encodeValue(value: unknown): string {
Expand All @@ -23,11 +24,11 @@ function decodeSingleKeyValue(key: string, value: string, images: CoverArt[]): v
if (keyName === 'url') {
images[imageIdx].url = new URL(value);
} else if (keyName === 'types') {
const types = JSON.parse(value);
const types = safeParseJSON(value);
if (!Array.isArray(types) || types.some((type) => typeof type !== 'number')) {
throw new Error(`Invalid 'types' parameter: ${value}`);
}
images[imageIdx].types = JSON.parse(value);
images[imageIdx].types = types;
} else {
images[imageIdx].comment = value;
}
Expand Down
21 changes: 21 additions & 0 deletions tests/lib/util/json.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { safeParseJSON } from '@lib/util/json';

describe('safe JSON parsing', () => {
    // Deliberately malformed input shared by the failure cases.
    const invalidJson = '{"""}';

    it('returns parsed JSON if valid', () => {
        const parsed = safeParseJSON<Record<string, string[]>>('{"hello": ["world", "test"]}');

        expect(parsed).toStrictEqual({ hello: ['world', 'test'] });
    });

    it('returns undefined on invalid JSON', () => {
        // Without a custom message, parse failures are swallowed.
        expect(safeParseJSON(invalidJson)).toBeUndefined();
    });

    it('throws on invalid JSON if custom message is set', () => {
        const parseWithMessage = (): unknown => safeParseJSON(invalidJson, 'custom message');

        expect(parseWithMessage).toThrowWithMessage(Error, /custom message/);
    });
});
Loading