import fs from 'fs';
import { promisify } from 'util';
import cheerio from 'cheerio';
import dayjs from 'dayjs';
import customParseFormat from 'dayjs/plugin/customParseFormat.js';
import slugify from '@sindresorhus/slugify';
import path from 'path';
// Directory that is scanned for Tumblr post HTML files (passed to
// getTumblrHtmlFiles in main).
const HTML_DIR = './tumblr-data/posts/html';
// Directory the generated Markdown posts are written into (used by
// exportEleventyPost to build the target path).
const TARGET_DIR = '../pop.md/site/legacy';
// Promise-returning wrappers around the callback-style fs functions so they
// can be awaited.
const readdir = promisify(fs.readdir);
const readFile = promisify(fs.readFile);
const writeFile = promisify(fs.writeFile);
// Register the customParseFormat plugin; parseTumblrHtml passes an explicit
// format string to dayjs() when reading the post timestamp.
dayjs.extend(customParseFormat);
/**
 * Lists the entries of a directory as paths prefixed with that directory.
 *
 * @param {string} dirName - Directory to read.
 * @returns {Promise<string[]>} `${dirName}/${entry}` for every entry returned
 *   by `readdir`, sorted with the default string ordering.
 */
async function getTumblrHtmlFiles(dirName) {
  const entries = await readdir(dirName);
  const prefixed = entries.map((entry) => `${dirName}/${entry}`);
  return prefixed.sort();
}
/**
 * Converts a YouTube embed URL into the matching watch-page URL.
 *
 * @param {string|null|undefined} src - The `src` of an embedded iframe.
 * @returns {string} `https://www.youtube.com/watch?v=<id>` when `src` is a
 *   YouTube embed URL, otherwise an empty string (also for a missing `src`).
 */
function convertEmbedIntoLink(src) {
  if (!src) {
    return '';
  }
  // Video IDs may contain hyphens as well as letters, digits and underscores;
  // without `-` in the class the ID would be cut off at the first hyphen.
  const youTubeMatch = src.match(/youtube\.com\/embed\/([a-zA-Z0-9_-]+)/);
  return youTubeMatch
    ? `https://www.youtube.com/watch?v=${youTubeMatch[1]}`
    : '';
}
/**
 * Reads the link target from the caption when the caption starts with a link.
 *
 * Only the first element returned by `$('.caption p').children()` is
 * inspected.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded post.
 * @returns {string|null|undefined} The `href` attribute of that element when
 *   it is an `<a>`, otherwise `null`.
 */
function extractLinkFromCaption($) {
  const firstChild = $('.caption p').children()[0];
  if (!firstChild || firstChild.tagName !== 'a') {
    return null;
  }
  return $(firstChild).attr('href');
}
/**
 * Collects the normalized text of every `span.tag` element of a post.
 *
 * Each tag has all dots and commas replaced by spaces, is trimmed and is
 * lower-cased, so it can be compared against the keywords used by
 * getCategoryFromTags.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded post.
 * @returns {string[]} Normalized tags in document order.
 */
function extractTags($) {
  return Array.from($('span.tag'), (rawTag) =>
    $(rawTag)
      .text()
      // The `g` flag is required: without it only the first separator is
      // replaced and e.g. "they.shoot.music" never matches its keyword.
      .replace(/[.,]/g, ' ')
      .trim()
      .toLowerCase()
  );
}
/**
 * Derives the post category from its tags, falling back to the link.
 *
 * The rules are checked top to bottom and the first rule with a matching tag
 * wins. When no tag matches, a link containing "soundcloud" yields 'Song'.
 *
 * @param {string[]} tags - Normalized (lower-case) tags of the post.
 * @param {string} link - Link of the post; may be an empty string.
 * @returns {string} The category name, or an empty string when none applies.
 */
function getCategoryFromTags(tags, link) {
  // Order matters: earlier entries take precedence over later ones.
  const rules = [
    ['Cover', ['cover']],
    ['Remix', ['remix', 'rework', 'mashup']],
    ['Short Film', ['short film']],
    ['Music Video', ['music video', 'musicvideo']],
    ['Live', ['live', 'live video', 'live session', 'they shoot music']],
    ['Song', ['song', 'audio', 'lyric video']],
  ];
  for (const [category, keywords] of rules) {
    if (keywords.some((keyword) => tags.includes(keyword))) {
      return category;
    }
  }
  return link.match(/soundcloud/) ? 'Song' : '';
}
/**
 * Reads one Tumblr post HTML file and derives the fields of an Eleventy post
 * from its caption, tags, embedded media and timestamp.
 *
 * The caption is matched against the shape
 * "<track> by <artist> [feat. …] [from <album>] [(…)]"; parts that do not
 * match are left as empty strings.
 *
 * @param {string} fileName - Path of the HTML file to parse.
 * @returns {Promise<object|null>} The parsed post, or `null` when the post has
 *   no caption text.
 */
async function parseTumblrHtml(fileName) {
  const html = await readFile(fileName);
  const $ = cheerio.load(html);

  const rawCaption = $('.caption').text().trim();
  // A post without caption text has nothing to export, so stop before doing
  // any further parsing.
  if (!rawCaption) {
    return null;
  }

  const tags = extractTags($);
  const rawMedia = $('iframe').attr('src');
  // Prefer a link at the start of the caption, fall back to the embedded media.
  const link = extractLinkFromCaption($) || convertEmbedIntoLink(rawMedia);

  // Groups: 1 = track, 2 = artist (incl. "feat."), 4 = album, 6 = parenthesised note.
  const captionMatch =
    rawCaption.match(
      /\s*(.*?)\s+by\s+(.*?)(\s+from\s+(.*?))?\s*(\((.*?)\)\s*)?\.?\s*$/
    ) || [];
  // Strip a trailing "feat. …" so only the main artist remains.
  const mainArtistMatch = (captionMatch[2] || '').match(
    /\s*(.*?)(\s+feat\.(.*))?$/
  );
  const mainArtist = (mainArtistMatch[1] || '').trim();
  const trackName = (captionMatch[1] || '').trim();

  const rawDate = $('#timestamp').text();
  const date = dayjs(rawDate, ' MMMM Do, YYYY h:mmA ');

  // File name: "<unix timestamp>[-<artist slug>][-<track slug>].md".
  const fileNameParts = [date.unix()];
  if (mainArtist) {
    fileNameParts.push(slugify(mainArtist));
  }
  if (trackName) {
    fileNameParts.push(slugify(trackName));
  }
  const targetFileName = `${fileNameParts.join('-')}.md`;

  // Function replacers keep `$` sequences literal and leave the caption
  // untouched when the artist/track is empty (an empty pattern matches the
  // empty string at position 0). The track is only turned into a Markdown
  // link when a link exists; otherwise "[track]()" would be a broken link.
  const templateContent = rawCaption
    .replace(mainArtist, (match) => (match ? `**${match}**` : match))
    .replace(trackName, (match) =>
      match && link ? `[${match}](${link})` : match
    );

  return {
    originalFileName: fileName,
    originalCaption: rawCaption,
    originalTags: tags,
    originalDate: rawDate,
    fileName: targetFileName,
    permalink: `/post/${path.basename(fileName, '.html')}/index.html`,
    artist: mainArtist,
    album: (captionMatch[4] || '').trim(),
    track: trackName,
    date: date.format('YYYY-MM-DD'),
    link,
    tags: ['post', 'legacy'],
    category: getCategoryFromTags(tags, link),
    templateContent,
  };
}
/**
 * Parses the given HTML files one after another.
 *
 * @param {Iterable<string>} files - Paths of the HTML files to parse.
 * @returns {Promise<object[]>} The parsed posts in input order; files for
 *   which parseTumblrHtml returned a falsy value are left out.
 */
async function parseTumblrHtmlFiles(files) {
  const results = [];
  for (const htmlFile of files) {
    // Awaited inside the loop on purpose: files are processed sequentially.
    results.push(await parseTumblrHtml(htmlFile));
  }
  return results.filter(Boolean);
}
/**
 * Quotes a string for use as a YAML front matter value.
 *
 * - Without a single quote the value is wrapped in single quotes.
 * - With a single quote but no double quote or backslash it is wrapped in
 *   double quotes, where it needs no escaping.
 * - Otherwise it is wrapped in single quotes with every embedded single quote
 *   doubled, which is the only escape YAML single-quoted scalars know.
 *   Wrapping such a value in double quotes unescaped would yield invalid YAML.
 *
 * @param {string} value - Raw value.
 * @returns {string} The quoted YAML scalar.
 */
function escape(value) {
  if (!value.includes("'")) {
    return `'${value}'`;
  }
  if (!/["\\]/.test(value)) {
    return `"${value}"`;
  }
  return `'${value.replace(/'/g, "''")}'`;
}
/**
 * Writes one parsed post as a Markdown file with YAML front matter into
 * TARGET_DIR.
 *
 * Front matter always contains `date` and `permalink`; `artist`, `album`,
 * `track`, `tags`, `category` and `link` are added only when present on the
 * post. The template content follows after a blank line and the file ends
 * with a newline.
 *
 * @param {object} post - Post as produced by parseTumblrHtml.
 * @returns {Promise<string|undefined>} The path of the written file, or
 *   `undefined` when writing failed (the failure is logged, not thrown).
 */
async function exportEleventyPost(post) {
  const lines = ['---', `date: ${post.date}`, `permalink: ${post.permalink}`];
  for (const key of ['artist', 'album', 'track']) {
    if (post[key]) {
      lines.push(`${key}: ${escape(post[key])}`);
    }
  }
  if (post.tags) {
    lines.push(`tags: [${post.tags.join(', ')}]`);
  }
  if (post.category) {
    lines.push(`category: ${post.category}`);
  }
  if (post.link) {
    // Quote through escape() like the other free-text values so a quote
    // character inside the URL cannot break the front matter.
    lines.push(`link: ${escape(post.link)}`);
  }
  lines.push('---');
  lines.push('');
  lines.push(post.templateContent);
  // Empty last entry so the joined content ends with a newline.
  lines.push('');
  const targetContent = lines.join('\n');
  const targetFileName = `${TARGET_DIR}/${post.fileName}`;
  try {
    await writeFile(targetFileName, targetContent);
    return targetFileName;
  } catch (error) {
    // Best effort: report the failure with its cause and let the caller carry
    // on with the remaining posts.
    console.error(`Could not export post ${targetFileName}.`, error);
    return undefined;
  }
}
/**
 * Exports the given posts one after another.
 *
 * @param {Iterable<object>} posts - Posts to export.
 * @returns {Promise<string[]>} The file names returned by exportEleventyPost,
 *   in input order; posts for which it returned a falsy value are left out.
 */
async function exportEleventy(posts) {
  const writtenFiles = [];
  for (const post of posts) {
    const writtenFile = await exportEleventyPost(post);
    if (!writtenFile) {
      continue;
    }
    writtenFiles.push(writtenFile);
  }
  return writtenFiles;
}
/**
 * Runs the whole conversion: list the HTML files in HTML_DIR, parse them,
 * report incomplete posts and export everything to Eleventy.
 *
 * Posts without a usable date are logged and abort the run with exit code 1.
 * Posts missing a category, link or artist are only counted and reported.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const htmlFiles = await getTumblrHtmlFiles(HTML_DIR);
  console.log(`Found ${htmlFiles.length} HTML files.`);

  const posts = await parseTumblrHtmlFiles(htmlFiles);
  console.log(`Parsed ${posts.length} posts.`);

  // Hard requirement: every post needs a date.
  const undated = posts.filter(
    ({ date }) => !date || date === 'Invalid Date'
  );
  if (undated.length > 0) {
    undated.forEach((post) => {
      console.log(post);
    });
    console.log(`${undated.length} posts are missing a date.`);
    process.exit(1);
  }

  // Soft checks: report the number of affected posts but keep going.
  const softChecks = [
    ['category', (count) => `${count} posts are missing a category.`],
    ['link', (count) => `${count} posts are missing a link.`],
    ['artist', (count) => `${count} posts are missing the artist.`],
  ];
  for (const [field, describe] of softChecks) {
    const affected = posts.filter((post) => !post[field]);
    if (affected.length > 0) {
      console.log(describe(affected.length));
    }
  }

  const exported = await exportEleventy(posts);
  console.log(`Exported ${exported.length} to Eleventy.`);
}
// Entry point. Handle a rejected run explicitly instead of leaving the promise
// floating: print the error and exit with a non-zero status.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});