Files
Images-Scrapper-JS/main.js

154 lines
4.9 KiB
JavaScript

import express from "express";
import * as cheerio from "cheerio";
import { randomUUID } from "crypto";
import axios from "axios";
import archiver from "archiver";
import mime from "mime";
import { rateLimit } from "express-rate-limit";
import "dotenv/config";
// ---------------------------------------------------------------------------
// Application setup
// ---------------------------------------------------------------------------
const app = express();
const PORT = 3000;

// Rate limiting is opt-in: it is enabled only when USE_RATE_LIMIT is exactly "true".
const useRateLimit = process.env.USE_RATE_LIMIT === "true";

// Searching is cheap for the client, so it gets a generous per-minute budget.
const getImagesLimiter = rateLimit({
  windowMs: 60 * 1000, // 1 min
  limit: 100,
  standardHeaders: "draft-8",
  legacyHeaders: false,
});

// Building a ZIP downloads every cached image, so it is limited far more tightly.
const downloadLimiter = rateLimit({
  windowMs: 60 * 1000, // 1 min
  limit: 5,
  standardHeaders: "draft-8",
  legacyHeaders: false,
});

// Browser-like headers sent with every outgoing request (search pages and images).
const headers = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
  "Accept-Language": "en-US,en;q=0.9",
};

// uuid -> list of image URLs produced by a search, consumed by the download route.
// The object is created without a prototype because it is indexed with
// request-supplied keys: "__proto__", "constructor", etc. must resolve to
// undefined instead of inherited Object members.
let cachedImagesUrls = Object.create(null);

app.set("trust proxy", 1);
app.use(express.static("public"));

if (useRateLimit) {
  app.use("/api/getImagesURL", getImagesLimiter);
  app.use("/api/downloadImages", downloadLimiter);
}

app.listen(PORT, () => {
  console.log(`Server launched on http://localhost:${PORT}`);
});
// Paging stops after more than this many consecutive result pages that add no new URL.
const MAX_STALE_PAGES = 32;
// Upper bound for a single HEAD probe, so one unresponsive host cannot stall a request.
const HEAD_PROBE_TIMEOUT_MS = 5000;

/**
 * GET /api/getImagesURL — collect image URLs for a search query from Bing.
 *
 * Query parameters:
 *   q      (required) search terms.
 *   offset (default 1) start position, sent as Bing's `first` parameter.
 *   count  (default 1000) maximum number of URLs to return.
 *   smart  (default "true") when "true", every candidate is probed with a HEAD
 *          request and kept only if it reports an `image/*` content type.
 *
 * Responds with `{ uuid, urls }`. The list is cached under `uuid` for ten
 * minutes so that /api/downloadImages can use it. Replies 400 for invalid
 * parameters and 500 when the search fails unexpectedly.
 */
app.get("/api/getImagesURL", async (req, res) => {
  try {
    // The default must be the *string* "true": query values are strings, and a
    // boolean default would never equal "true" in the comparison below.
    const { q, smart = "true" } = req.query;
    if (!q) {
      return res.status(400).send("Missing 'q' parameter");
    }
    if (typeof q !== "string") {
      return res.status(400).send("Invalid 'q' parameter");
    }

    let offset = Number(req.query.offset ?? 1);
    const count = Number(req.query.count ?? 1000);
    if (!Number.isInteger(offset) || offset < 0) {
      return res.status(400).send("Invalid 'offset' parameter");
    }
    if (!Number.isInteger(count) || count < 0) {
      return res.status(400).send("Invalid 'count' parameter");
    }
    const verifyContentType = smart === "true";

    const imagesUrls = [];
    const acceptedUrls = new Set(); // mirrors imagesUrls for O(1) duplicate checks
    let stalePages = 0;

    do {
      const pageUrl = `https://www.bing.com/images/async?q=${encodeURIComponent(q)}&first=${offset}`;
      const pageResponse = await fetch(pageUrl, { headers });
      const candidates = extractImageUrls(await pageResponse.text());
      offset += candidates.length;

      let addedFromPage = 0;
      for (const candidate of candidates) {
        if (imagesUrls.length >= count) {
          break; // enough results; do not probe the rest of the page
        }
        if (acceptedUrls.has(candidate)) {
          continue;
        }
        if (verifyContentType && !(await respondsWithImage(candidate))) {
          continue;
        }
        acceptedUrls.add(candidate);
        imagesUrls.push(candidate);
        addedFromPage += 1;
      }

      stalePages = addedFromPage === 0 ? stalePages + 1 : 0;
      if (stalePages > MAX_STALE_PAGES) {
        break;
      }
    } while (imagesUrls.length < count);

    const uuid = randomUUID();
    cachedImagesUrls[uuid] = imagesUrls;
    // Drop the cached list after ten minutes.
    setTimeout(
      () => {
        delete cachedImagesUrls[uuid];
      },
      10 * 60 * 1000,
    );
    res.send({ uuid, urls: imagesUrls });
  } catch (error) {
    console.error(error);
    // Always answer, otherwise the client waits on an open connection.
    if (!res.headersSent) {
      res.status(500).send("Unable to retrieve image URLs");
    }
  }
});

/**
 * Probe a URL with a HEAD request and report whether it declares an image.
 *
 * @param {string} url - Address to probe.
 * @returns {Promise<boolean>} true when the `content-type` response header
 *   starts with `image/`; false otherwise, including network errors and timeouts.
 */
async function respondsWithImage(url) {
  try {
    const response = await fetch(url, {
      method: "HEAD",
      headers,
      signal: AbortSignal.timeout(HEAD_PROBE_TIMEOUT_MS),
    });
    const contentType = response.headers.get("content-type");
    return Boolean(contentType?.startsWith("image/"));
  } catch {
    // Best effort: an unreachable or slow host simply means the URL is not kept.
    return false;
  }
}
/**
 * GET /api/downloadImages — stream a ZIP archive of the images cached for `uuid`.
 *
 * Query parameters:
 *   uuid (required) identifier returned by /api/getImagesURL.
 *
 * Images are fetched one after another and appended as `image-<n>.<ext>`,
 * numbered consecutively; images that cannot be fetched are logged and skipped.
 * Replies 400 when `uuid` is unknown. On an unexpected failure it replies 500
 * if nothing was sent yet, otherwise the connection is torn down so the client
 * does not wait forever on a truncated archive.
 */
app.get("/api/downloadImages", async (req, res) => {
  try {
    const { uuid } = req.query;
    // Own-property lookup only: inherited keys such as "constructor" must not
    // be mistaken for a cached list.
    const imagesUrls =
      typeof uuid === "string" && Object.hasOwn(cachedImagesUrls, uuid) ? cachedImagesUrls[uuid] : undefined;
    if (!imagesUrls) {
      return res.status(400).send("Invalid 'uuid'");
    }

    res.setHeader("Content-Type", "application/zip");
    res.setHeader("Content-Disposition", `attachment; filename=${uuid}.zip`);

    const archive = archiver("zip", { zlib: { level: 9 } });
    // Without a listener an archive 'error' event becomes an uncaught exception.
    archive.on("error", (error) => {
      console.error(error);
      res.destroy(error);
    });
    archive.pipe(res);

    let archivedCount = 0;
    for (const url of imagesUrls) {
      try {
        const response = await axios.get(url, { responseType: "stream", timeout: 5000, headers: headers });
        const contentType = response.headers["content-type"];
        // Fallback when the MIME type is unknown: the suffix of the URL path,
        // or "bin" when the path carries no extension at all.
        const pathExtension = /\.([A-Za-z0-9]+)$/.exec(new URL(url).pathname);
        const extension = mime.getExtension(contentType) || (pathExtension ? pathExtension[1] : "bin");
        archivedCount += 1;
        archive.append(response.data, { name: `image-${archivedCount}.${extension}` });
      } catch (error) {
        console.warn(`Unable to download image ${url} : ${error}`);
      }
    }
    await archive.finalize();
  } catch (error) {
    console.error(error);
    if (res.headersSent) {
      // The ZIP is already streaming; a status code can no longer be sent.
      res.destroy();
    } else {
      res.status(500).send("Unable to build the archive");
    }
  }
});
/**
 * Extract the original image URLs from a Bing image-results HTML fragment.
 *
 * Every `a.iusc` anchor is expected to carry an `m` attribute containing JSON
 * with a `murl` field. Each URL is reduced to origin + path (query string and
 * fragment are dropped). Anchors whose metadata is missing, is not valid JSON,
 * or whose `murl` is not an absolute http(s) URL are skipped, so a single bad
 * entry does not abort the whole extraction.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Normalised URLs in document order (duplicates are kept).
 */
function extractImageUrls(html) {
  const $ = cheerio.load(html);
  const urls = [];
  for (const anchor of $("a.iusc").toArray()) {
    try {
      const { murl } = JSON.parse($(anchor).attr("m"));
      const parsed = new URL(murl);
      // Other schemes (data:, blob:, ...) have the origin "null" and cannot be
      // fetched as remote images.
      if (parsed.protocol === "http:" || parsed.protocol === "https:") {
        urls.push(parsed.origin + parsed.pathname);
      }
    } catch {
      // Malformed metadata or URL: ignore this anchor and keep going.
    }
  }
  return urls;
}