~metalune/simplynews_sites

3e12040635cea01421467897225184062ba2f23a — bopol 3 days ago 36eee73
Le Figaro: support live articles
2 files changed, 152 insertions(+), 39 deletions(-)

A simplynews_sites/helpers/utils.py
M simplynews_sites/lefigaro.py
A simplynews_sites/helpers/utils.py => simplynews_sites/helpers/utils.py +12 -0
@@ 0,0 1,12 @@
import bs4


def value_in_element_attr(element, value, attr="class"):
    if not type(element) == bs4.element.Tag:
        return False
    else:
        attrs = element.attrs
        if attrs is not None and attrs != {}:
            return value in element.attrs.get(attr)
        else:
            return False

M simplynews_sites/lefigaro.py => simplynews_sites/lefigaro.py +140 -39
@@ 1,10 1,11 @@
from .helpers import rss
from datetime import timedelta
from .helpers import rss, utils
from datetime import timedelta, datetime
from bs4 import BeautifulSoup
import requests

import feedparser
import urllib
import json
import bs4

cache_refresh_time_delta = timedelta(hours=3)
identifier = "lefigaro"


@@ 15,10 16,17 @@ rss_feed = f"{base_url}/rss/figaro_actualites.xml"


def get_image(img):

    data_srcset = img.get("data-srcset")
    if data_srcset is not None:
        src = data_srcset.split()[0]
    else:
        src = img["src"]

    return {
        "type": "image",
        "src": img["data-srcset"].split()[0],
        "alt": img["alt"]
        "src": src,
        "alt": img.get("alt")
    }




@@ 51,21 59,33 @@ def get_page(url):

    post = soup.select_one("article")

    json_element = soup.find("script", type="application/ld+json")
    if json_element is not None:
        info_json = json.loads(json_element.next)

    if subtitle.endswith("..."):
        standfirst = post.select_one("p.fig-standfirst")
        if standfirst is not None:
            subtitle = standfirst.text

    meta_info = post.select_one("div.fig-content-metas-info")
    author = meta_info.select_one("span.fig-content-metas__authors").text
    last_updated = meta_info.select_one(
        "span.fig-content-metas__pub-maj-date > time")

    if last_updated is None:  # It hasn't been updated, then we get published date
        last_updated = meta_info.select_one(
            "span.fig-content-metas__pub-date > time")

    last_updated = last_updated.text
    for element in info_json:
        if element["@type"] == "NewsArticle":
            last_updated = element.get("dateModified")
            if last_updated is None:
                last_updated = element.get("datePublished")
            author_array = element.get("author")
            if author_array is not None:
                authors = []
                for author_obj in author_array:
                    authors.append(author_obj["name"])
                author = ", ".join(authors)
            else:
                author = "Unknown"

    last_updated_datetime = datetime.strptime(
        last_updated, "%Y-%m-%dT%H:%M:%S.%fZ")
    if last_updated_datetime is not None:
        last_updated = str(last_updated_datetime)

    data = {
        "title": title,


@@ 76,14 96,15 @@ def get_page(url):

    article = []

    heading_image = post.select_one("article > figure.fig-media img")
    heading_image = post.select_one(
        "article > figure.fig-media img") or soup.select_one("div.fig-wrapper figure.fig-media img")
    if heading_image is not None:
        article.append(get_image(heading_image))

    post_content = post.select_one("div.fig-body")

    if post_content is None:  # not an article
        poll_element = post.select_one("div.fig-poll")
    if post_content is None:  # not a regular article
        poll_element = post.select_one("div.fig-poll")  # poll "article"
        if poll_element is not None:
            entries = []
            results = poll_element.select("div.fig-poll__result")


@@ 99,31 120,37 @@ def get_page(url):
            }

            article.append(el)
            data["article"] = article
            return data

        live_messages = post.select("article.live-message")  # live "article"
        if live_messages is not None:
            for message in live_messages:
                message_title = message.select_one(".live-title")
                article.append({
                    "type": "header",
                    "size": "h2",
                    "value": message_title.text
                })
                date = message.select_one("time")
                if date is not None:
                    # date_time = datetime.fromisoformat(date["datetime"])
                    article.append({
                        "type": "paragraph",
                        "value": "Publié {}".format(date.text)
                    })

                message_body = message.select_one("div.live-article")
                for element in message_body:
                    el = get_element(element, True)
                    if el is not None and el != {}:
                        article.append(el)

        data["article"] = article
        return data

    for element in post_content:
        el = {}
        if element.name == "p" and 'fig-paragraph' in element.attrs.get("class"):
            strong = element.select_one("strong")
            if strong is None or not strong.text.startswith("À voir aussi"):
                el["type"] = "paragraph"
                el["value"] = element.text
        elif element.name == "div" and "fig-premium-paywall" in element.attrs.get("class"):
            # paywall. Display info (% left) without info encouraging to subscribe
            info = element.select_one("p.fig-premium-paywall__infos")
            if info is not None:
                el["type"] = "paragraph"
                el["value"] = info.text
        elif element.name == "figure":
            img = element.select_one("img")
            if img is not None:
                article.append(get_image(img))
        elif element.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            el["type"] = "header"
            el["size"] = element.name
            el["value"] = element.text
        el = get_element(element)

        if el is not None and el != {}:
            article.append(el)


@@ 148,8 175,76 @@ def get_recent_articles():
    return feed_


def is_related_article(element):
    if element is None:
        return False

    text = element.text.lower()
    return text.startswith("à voir aussi") or "lire aussi -" in text or "lire notre article" in text


def get_element(element, is_live=False):
    el = {}

    if type(element) == bs4.element.NavigableString:
        el["type"] = "text"
        el["value"] = str(element)
        return el

    if element.name == "p":
        if is_live or (not is_live and utils.value_in_element_attr(element, "fig-paragraph")):
            strong = element.select_one("strong")
            if not is_related_article(strong) and not is_related_article(element):
                el["type"] = "paragraph"
                el["value"] = element.text
    elif element.name == "div" and utils.value_in_element_attr(element, "fig-premium-paywall"):
        # paywall. Display info (% left) without info encouraging to subscribe
        info = element.select_one("p.fig-premium-paywall__infos")
        if info is not None:
            el["type"] = "paragraph"
            el["value"] = info.text
    elif element.name == "figure":
        img = element.select_one("img")
        if img is not None:
            return get_image(img)
        blockquote = element.select_one("blockquote")
        if blockquote is not None:
            el["type"] = "blockquote"
            el["value"] = blockquote.text
            return el
        span = element.select_one("span")
        if span is not None and "data-html" in span.attrs:
            data_html = BeautifulSoup(span["data-html"], "lxml")
            iframe = data_html.select_one("iframe")
            if iframe is not None:
                el["type"] = "iframe"
                el["src"] = iframe["src"]
                el["width"] = iframe.get("width")
                el["height"] = iframe.get("height")
                return el

            img = data_html.select_one("img")
            if img is not None:
                el = get_image(img)

    elif element.name == "strong" and not is_related_article(element):
        el["type"] = "strong"
        el["value"] = element.text
    elif element.name == "em" or element.name == "i":
        el["type"] = "em"
        el["value"] = element.text
    elif element.name == "br":
        el["type"] = "linebreak"
    elif element.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        el["type"] = "header"
        el["size"] = element.name
        el["value"] = element.text

    return el


if __name__ == "__main__":
    page_url = "politique/menu-sans-viande-dans-les-cantines-de-lyon-darmanin-denonce-la-mesure-doucet-lui-repond-20210221"
    # page_url = "politique/menu-sans-viande-dans-les-cantines-de-lyon-darmanin-denonce-la-mesure-doucet-lui-repond-20210221"

    # page_url = "flash-actu/libye-le-ministre-de-l-interieur-echappe-a-une-tentative-d-assassinat-20210221"
    # not updated (yet)


@@ 166,5 261,11 @@ if __name__ == "__main__":
    # page_url = "vox/societe/didier-lemaire-a-trappes-nous-ne-sommes-plus-en-france-20210219"
    # subtitle cut

    # page_url = "sciences/en-direct-covid-19-les-alpes-maritimes-attendent-les-decisions-du-gouvernement-20210222"
    # "live" article

    page_url = "confinement-partiel-commerces-ce-qu-il-faut-retenir-des-mesures-de-restriction-dans-les-alpes-maritimes-20210222"
    # multiple authors

    page = get_page(page_url)
    print(page)