~metalune/simplynews_sites

c8a0e5b6b25bf7306496d8b3aab1badf056a7451 — bopol 13 days ago b801798
franceinfo: fix videos, standardize date / authors, fix empty paragraphs
2 files changed, 52 insertions(+), 49 deletions(-)

M .gitignore
M simplynews_sites/franceinfo.py
M .gitignore => .gitignore +1 -0
@@ 4,3 4,4 @@ simplynews_sites/helpers/__pycache__
build/
dist/
simplynews_sites.egg-info/
.vscode/

M simplynews_sites/franceinfo.py => simplynews_sites/franceinfo.py +51 -49
@@ 1,5 1,5 @@
from .helpers import rss, utils
from datetime import timedelta
from datetime import timedelta, datetime
from bs4 import BeautifulSoup
from html import unescape
import requests


@@ 10,9 10,10 @@ identifier = "franceinfo"
site_title = "Franceinfo"

base_url = "https://www.francetvinfo.fr"

rss_feed = f"{base_url}/titres.rss"

DATE_PATTERN = "%Y-%m-%dT%H:%M:%S%z"


def get_image(img):
    if img is None:


@@ 62,55 63,36 @@ def get_page(url):
    subtitle = unescape(subtitle)

    json_element = soup.find("script", type="application/ld+json")
    if json_element:
        info_json = json.loads(json_element.next)
    info = json.loads(json_element.next)

    post = soup.select_one("article")
    datePublished = info["datePublished"]
    dateModified = info["dateModified"]
    if dateModified:
        last_updated = datetime.strptime(dateModified, DATE_PATTERN)
    else:
        last_updated = datetime.strptime(datePublished, DATE_PATTERN)

    author = info["author"]
    authors = []
    author_group = None
    if isinstance(author, list):
        for auth in author:
            authors.append(auth["name"])
    elif isinstance(author, dict):
        authors.append(author["name"])
    else:
        print("could not get author")
    author = ", ".join(authors)

    post = soup.select_one("article")
    aside = post.select_one("aside")

    if aside:
        aside = post.select_one("aside")

        publish_date = aside.select_one("p.publish-date")
        # there are two <time> in this object, the first one is last_updated, the second one is the original publish date
        last_updated = publish_date.select_one("time").text

        author_list = aside.select_one("div.authors-list")
        for author in author_list.select(".author"):
            authors.append(author.text)

        author_group = author_list.select_one(".group").text

        heading_image = post.select_one("div.left-wrapper > figure img")

        post_content = post.select_one("div.text")

    else:
        publish_date = post.select_one("div.publication-date")
        # there are two dates in this object, the first one is the publish date, the second one is the last_updated
        dates = publish_date.select("time")
        # last in the list
        last_updated = dates[-1].text

        author_list = post.select_one("div.c-signature__authors")
        for author in author_list.select(".c-signature__names"):
            authors.append(author.text)

        author_group = author_list.select_one(
            ".c-signature__group-team-wrapper").text

        heading_image = post.select_one("div.c-cover figure img")

        post_content = post.select_one("div.c-body")

    author = ", ".join(authors)
    if author_group:
        author = "{} ({})".format(author, author_group.strip(" \n"))

    data = {
        "title": title,
        "subtitle": subtitle,


@@ 123,26 105,37 @@ def get_page(url):
    if heading_image:
        article.append(get_image(heading_image))

    heading_video = post.select_one("figure.video")
    heading_video = post.select_one(
        ".c-cover > .resp-video") or post.select_one("figure.video")
    if heading_video:
        iframe_element = heading_video.select_one("iframe")
        iframe = get_iframe(iframe_element)
        if iframe:
            article.append(iframe)

    heading_video = post.select_one("figure.player-video")
    heading_video = post.select_one(
        ".c-cover > figure.francetv-player-wrapper") or post.select_one("figure.player-video")
    if heading_video:
        if info_json and "video" in info_json:
            video = info_json["video"]
        if info and "video" in info:
            video = info["video"]

            if isinstance(video, list):
                video = video[0]

            src = video.get("embedUrl") or video.get("embedURL")
            width = video.get("width")
            if width:
                width = width.get("value")
            height = video.get("height")
            if height:
                height = height.get("value")

            article.append({
                "type": "iframe",
                "src": video["embedURL"],
                "width": video["width"]["value"],
                "height": video["height"]["value"]
                "src": src,
                "title": video.get("name"),
                "width": width,
                "height": height
            })

        # else: embedded live stream, but it's not using iframe and blob src url for <video> then hard to extract


@@ 167,8 160,7 @@ def get_page(url):
                el = get_image(img)

            elif ">>" not in element.text:  # ignore related article links
                el["type"] = "paragraph"
                el["value"] = element.text
                el = get_paragraph(element.text)

        elif element.name == "blockquote":
            el["type"] = "blockquote"


@@ 187,8 179,7 @@ def get_page(url):
                el = {}

        elif element.name == "span":
            el["type"] = "paragraph"
            el["value"] = element.text
            el = get_paragraph(element.text)

        elif element.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            el["type"] = "header"


@@ 215,6 206,17 @@ def get_page(url):
    return data


def get_paragraph(text):
    """Build a paragraph element from *text*, or ``None`` when it is blank.

    Leading and trailing newlines are trimmed; a string that is empty after
    trimming (e.g. the text of an empty ``<p>`` tag) produces no element, so
    callers can skip it. Spaces are not stripped, only newlines.
    """
    value = text.strip("\n")
    if not value:
        return None
    return {"type": "paragraph", "value": value}


def get_recent_articles():
    """Return the site's recent articles by parsing the module-level RSS feed.

    Delegates to the shared ``rss.default_feed_parser`` helper with
    ``rss_feed`` (the Franceinfo ``titres.rss`` URL defined above).
    """
    return rss.default_feed_parser(rss_feed)