2019-07-22 00:38:56 +02:00

103 lines
2.6 KiB
JavaScript

import Metascraper from 'metascraper'
import fetch from 'node-fetch'
import fs from 'fs'
import path from 'path'
import { ApolloError } from 'apollo-server'
import isEmpty from 'lodash/isEmpty'
import isArray from 'lodash/isArray'
import mergeWith from 'lodash/mergeWith'
// Logger created by the `debug` package under the 'embed:error' namespace.
const error = require('debug')('embed:error')

// Rule bundles handed to Metascraper, in the order they are listed here.
// The commented-out entries are rules that are currently disabled.
const scraperRules = [
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-lang')(),
  require('metascraper-lang-detector')(),
  require('metascraper-logo')(),
  // require('metascraper-clearbit-logo')(),
  require('metascraper-publisher')(),
  require('metascraper-title')(),
  require('metascraper-url')(),
  require('metascraper-audio')(),
  require('metascraper-soundcloud')(),
  require('metascraper-video')(),
  require('metascraper-youtube')(),
  // require('./rules/metascraper-embed')()
]

// Scraper instance used to extract metadata from fetched HTML.
const metascraper = Metascraper(scraperRules)
// Load the oEmbed provider list stored next to this module.
// Some providers allow a `{format}` placeholder in their endpoint URL and we
// always need JSON. A string pattern passed to `replace` only substitutes the
// first occurrence, so use a global regex to rewrite every placeholder.
const oEmbedProvidersFile = fs
  .readFileSync(path.join(__dirname, './providers.json'), 'utf8')
  .replace(/\{format\}/g, 'json')
const oEmbedProviders = JSON.parse(oEmbedProvidersFile)
/**
 * Look up oEmbed data for a URL.
 *
 * The provider is chosen by checking whether a provider's `provider_url`
 * string contains the hostname of `url`; the provider's first endpoint is
 * then queried with `url` and `format=json` as query parameters.
 *
 * Every failure mode (no matching provider, provider without a usable
 * endpoint, malformed endpoint URL, network error, non-2xx response,
 * unparsable or non-object JSON) resolves to an empty object so the caller
 * can still use data from other sources.
 *
 * @param {URL} url - parsed URL of the resource (reads `hostname` and `href`)
 * @returns {Promise<Object>} `{ type, html, author, date, sources: ['oembed'] }`
 *   on success, `{}` otherwise
 */
const fetchEmbed = async url => {
  // NOTE(review): this is a substring match on the provider URL, not a strict
  // hostname comparison, so short hostnames can match unrelated providers.
  const provider = oEmbedProviders.find(({ provider_url: providerUrl }) => {
    return typeof providerUrl === 'string' && providerUrl.includes(url.hostname)
  })
  if (!provider) return {}

  const [endpoint] = provider.endpoints || []
  if (!endpoint || !endpoint.url) return {}

  let json
  try {
    // Built inside the try block so a malformed endpoint URL is handled like
    // any other fetch failure instead of rejecting the caller.
    const endpointUrl = new URL(endpoint.url)
    // `set` instead of `append`: an endpoint that already carries a `format`
    // parameter must not end up with two of them.
    endpointUrl.searchParams.set('url', url.href)
    endpointUrl.searchParams.set('format', 'json')
    const response = await fetch(endpointUrl)
    if (!response.ok) {
      // An error response may still have a JSON body; do not treat it as embed data.
      error(`Error fetching embed data: ${response.status} ${response.statusText}`)
      return {}
    }
    json = await response.json()
  } catch (err) {
    error(`Error fetching embed data: ${err.message}`)
    return {}
  }

  // Valid JSON is not necessarily an object (e.g. `null`).
  if (!json || typeof json !== 'object') return {}

  return {
    type: json.type,
    html: json.html,
    author: json.author_name,
    date: json.upload_date,
    sources: ['oembed'],
  }
}
/**
 * Download the resource at `url` and extract its metadata with metascraper.
 *
 * A non-2xx response resolves to an empty object: the body of an error page
 * is not the requested resource and must not be scraped as if it were.
 * Network-level failures from `fetch` are not caught here and reject the
 * returned promise.
 *
 * @param {URL} url - parsed URL of the resource (reads `href`)
 * @returns {Promise<Object>} the scraped metadata plus `sources: ['resource']`,
 *   or `{}` when the server answered with an error status
 */
const fetchResource = async url => {
  const response = await fetch(url)
  if (!response.ok) {
    error(`Error fetching resource: ${response.status} ${response.statusText}`)
    return {}
  }
  const html = await response.text()
  const resource = await metascraper({ html, url: url.href })
  return {
    sources: ['resource'],
    ...resource,
  }
}
/**
 * Collect preview data for a URL by merging scraped page metadata with
 * oEmbed data.
 *
 * Both lookups run in parallel. Values from the oEmbed result are merged
 * into the page metadata; array values (such as `sources`) are concatenated
 * rather than replaced.
 *
 * @param {string|URL} url - address of the resource to scrape
 * @returns {Promise<Object>} merged data; `type` is the merged type when one
 *   is present and `'link'` otherwise
 * @throws {TypeError} when `url` cannot be parsed by the `URL` constructor
 * @throws {ApolloError} with code `NOT_FOUND` when neither lookup produced data
 */
export default async function scrape(url) {
  // Work on our own URL instance rather than reassigning the parameter.
  const target = new URL(url)
  if (target.hostname === 'youtu.be') {
    // replace youtu.be to get proper results
    target.hostname = 'youtube.com'
  }

  const [meta, embed] = await Promise.all([fetchResource(target), fetchEmbed(target)])
  const output = mergeWith(meta, embed, (objValue, srcValue) => {
    if (isArray(objValue)) {
      return objValue.concat(srcValue)
    }
    // Returning undefined lets mergeWith apply its default merge for this key.
    return undefined
  })

  if (isEmpty(output)) {
    throw new ApolloError('Not found', 'NOT_FOUND')
  }

  // The merged object can carry an explicit `type: undefined` (embed data
  // without a type). Spreading it after the default would overwrite 'link'
  // with undefined, so resolve the fallback explicitly.
  const { type, ...rest } = output
  return {
    type: type || 'link',
    ...rest,
  }
}