mirror of
https://github.com/IT4Change/Ocelot-Social.git
synced 2025-12-13 07:45:56 +00:00
173 lines
4.4 KiB
JavaScript
173 lines
4.4 KiB
JavaScript
const metascraper = require('metascraper')([
|
|
require('metascraper-author')(),
|
|
require('metascraper-date')(),
|
|
require('metascraper-description')(),
|
|
require('metascraper-image')(),
|
|
require('metascraper-lang')(),
|
|
require('metascraper-lang-detector')(),
|
|
require('metascraper-logo')(),
|
|
require('metascraper-logo-favicon')(),
|
|
// require('metascraper-clearbit-logo')(),
|
|
require('metascraper-publisher')(),
|
|
require('metascraper-title')(),
|
|
require('metascraper-url')(),
|
|
require('metascraper-audio')(),
|
|
require('metascraper-soundcloud')(),
|
|
require('metascraper-video')(),
|
|
require('metascraper-youtube')()
|
|
|
|
// require('./rules/metascraper-embed')()
|
|
])
|
|
const { ApolloError } = require('apollo-server')
|
|
const parseUrl = require('url')
|
|
|
|
const got = require('got')
|
|
const request = require('request-promise-native')
|
|
const find = require('lodash/find')
|
|
const isEmpty = require('lodash/isEmpty')
|
|
const each = require('lodash/each')
|
|
const isArray = require('lodash/isArray')
|
|
const mergeWith = require('lodash/mergeWith')
|
|
const urlParser = require('url')
|
|
|
|
// quick in memory cache
|
|
let cache = {}
|
|
|
|
let oEmbedProviders = []
|
|
const getEmbedProviders = async () => {
|
|
let providers = await request('https://oembed.com/providers.json')
|
|
providers = JSON.parse(providers)
|
|
oEmbedProviders = providers
|
|
return providers
|
|
}
|
|
getEmbedProviders()
|
|
|
|
const removeEmptyAttrs = obj => {
|
|
let output = {}
|
|
each(obj, (o, k) => {
|
|
if (!isEmpty(o)) {
|
|
output[k] = o
|
|
}
|
|
})
|
|
return output
|
|
}
|
|
|
|
const scraper = {
|
|
async fetch(targetUrl) {
|
|
if (targetUrl.indexOf('//youtu.be/')) {
|
|
// replace youtu.be to get proper results
|
|
targetUrl = targetUrl.replace('//youtu.be/', '//youtube.com/')
|
|
}
|
|
|
|
if (cache[targetUrl]) {
|
|
return cache[targetUrl]
|
|
}
|
|
|
|
const url = parseUrl.parse(targetUrl, true)
|
|
|
|
let meta = {}
|
|
let embed = {}
|
|
|
|
// only get data from requested services
|
|
await Promise.all([
|
|
new Promise(async (resolve, reject) => {
|
|
try {
|
|
meta = await scraper.fetchMeta(targetUrl)
|
|
resolve()
|
|
} catch(err) {
|
|
if (process.env.DEBUG) {
|
|
console.error(`ERROR at fetchMeta | ${err.message}`)
|
|
}
|
|
resolve()
|
|
}
|
|
}),
|
|
new Promise(async (resolve, reject) => {
|
|
try {
|
|
embed = await scraper.fetchEmbed(targetUrl)
|
|
resolve()
|
|
} catch(err) {
|
|
if (process.env.DEBUG) {
|
|
console.error(`ERROR at fetchEmbed | ${err.message}`)
|
|
}
|
|
resolve()
|
|
}
|
|
})
|
|
])
|
|
|
|
const output = mergeWith(
|
|
meta,
|
|
embed,
|
|
(objValue, srcValue) => {
|
|
if (isArray(objValue)) {
|
|
return objValue.concat(srcValue);
|
|
}
|
|
}
|
|
)
|
|
|
|
if (isEmpty(output)) {
|
|
throw new ApolloError('Not found', 'NOT_FOUND')
|
|
}
|
|
|
|
// fix youtube start parameter
|
|
const YouTubeStartParam = url.query.t || url.query.start
|
|
if (output.publisher === 'YouTube' && YouTubeStartParam) {
|
|
output.embed = output.embed.replace('?feature=oembed', `?feature=oembed&start=${YouTubeStartParam}`)
|
|
output.url += `&start=${YouTubeStartParam}`
|
|
}
|
|
|
|
// write to cache
|
|
cache[targetUrl] = output
|
|
|
|
return output
|
|
},
|
|
async fetchEmbed(targetUrl) {
|
|
const url = urlParser.parse(targetUrl)
|
|
const embedMeta = find(oEmbedProviders, provider => {
|
|
return provider.provider_url.indexOf(url.hostname) >= 0
|
|
})
|
|
if (!embedMeta) {
|
|
return {}
|
|
}
|
|
const embedUrl = embedMeta.endpoints[0].url.replace('{format}', 'json')
|
|
|
|
let data
|
|
try {
|
|
data = await request(`${embedUrl}?url=${targetUrl}`)
|
|
data = JSON.parse(data)
|
|
} catch (err) {
|
|
data = await request(`${embedUrl}?url=${targetUrl}&format=json`)
|
|
data = JSON.parse(data)
|
|
}
|
|
if (data) {
|
|
let output = {
|
|
type: data.type || 'link',
|
|
embed: data.html,
|
|
author: data.author_name,
|
|
date: data.upload_date ? new Date(data.upload_date).toISOString() : null
|
|
}
|
|
|
|
output.sources = ['oembed']
|
|
|
|
return output
|
|
}
|
|
return {}
|
|
},
|
|
async fetchMeta(targetUrl) {
|
|
|
|
// const parsedURL = urlParser.parse(targetUrl)
|
|
// console.log(parsedURL)
|
|
|
|
// get from cach
|
|
|
|
const { body: html, url } = await got(targetUrl)
|
|
const metadata = await metascraper({ html, url })
|
|
|
|
metadata.sources = ['resource']
|
|
metadata.type = 'link'
|
|
|
|
return metadata
|
|
}
|
|
}
|
|
|
|
module.exports = scraper
|