~tieong/download-course

6f21d3fa2693df55fe9b0f691c28940fe91b6dbd — Thomas Ieong 2 years ago 58d1c35
Store the file_path instead of the file_name

This accounts for files that share the same name in different section
directories, and so prevents them from being redownloaded every time the
script is run.
M download_course/download_course.py => download_course/download_course.py +11 -8
@@ 25,6 25,7 @@ from download_course.utils.config_json import (
    init_config,
    update_config,
    update_etag,
    STORAGE_FILE
)
from download_course.utils.selectors import (
    REPLACE_MESelectors,


@@ 292,31 293,32 @@ class REPLACE_MEPage():
    ) -> None:
        """Download files if they are missing or outdated."""
        init_config()
        file_downloaded = check_file_downloaded(file_name)
        is_etag_same = check_file_etag(etag)
        document_path = section_dir / file_name
        file_downloaded = check_file_downloaded(document_path)
        is_etag_same = check_file_etag(document_path, etag)
        if not file_downloaded:
            logging.info(
                "The file isn't in the json, downloading %s",
                file_name
            )
            file_metadata = {file_name: etag}
            file_metadata = {str(document_path): etag}
            update_config(file_metadata)
            document_dir = section_dir / file_name
            document_dl = self.session.get(
                document_link_head.headers["Location"]
            )
            document_dir.write_bytes(document_dl.content)
            document_path.write_bytes(document_dl.content)
            self.files_downloaded += 1
        elif not is_etag_same:
            logging.info(
                "The etags are different! Downloading %s",
                file_name
            )
            update_etag(file_name, etag)
            document_dir = section_dir / file_name
            update_etag(document_path, etag)
            document_dl = self.session.get(
                document_link_head.headers["Location"]
            )
            document_dir.write_bytes(document_dl.content)
            document_path.write_bytes(document_dl.content)
            self.files_downloaded += 1
        else:
            logging.debug(
                "The file %s has already been downloaded!",


@@ 401,6 403,7 @@ def download_courses(
            args.output_dir,
            args.promotion
        )
    logging.info("The config file is located at %s", STORAGE_FILE)
    logging.info(
        "The number of files downloaded is : %s",
        REPLACE_ME_page.files_downloaded

M download_course/utils/config_json.py => download_course/utils/config_json.py +2 -4
@@ 3,7 3,6 @@
"""Regroup all the routines pertaining the download of courses."""

import sys
import logging
import pathlib
import json
from typing import Dict, Any


@@ 34,7 33,6 @@ elif sys.platform == "darwin":

def init_config() -> None:
    """Init the json if it doesn't exists"""
    logging.info("The config file is located at %s", STORAGE_FILE)
    if not STORAGE_FILE.exists():
        STORAGE_FILE.write_text(
            json.dumps({}, ensure_ascii=False, indent=4),


@@ 47,10 45,10 @@ def read_config() -> Any:
    return json.loads(STORAGE_FILE.read_text(encoding="utf-8"))


def update_etag(file_name: str, etag: str) -> None:
def update_etag(file_path: pathlib.Path, etag: str) -> None:
    """Update etags in the json file"""
    data = json.loads(STORAGE_FILE.read_text(encoding="utf-8"))
    data[file_name] = etag
    data[str(file_path)] = etag

    STORAGE_FILE.write_text(
        json.dumps(data, ensure_ascii=False, indent=4),

M download_course/utils/utils.py => download_course/utils/utils.py +8 -5
@@ 36,21 36,24 @@ def slugify(
    return re.sub(r'[-\s]+', '-', string).strip('-_')


def check_file_downloaded(file_name: str) -> bool:
def check_file_downloaded(file_path: pathlib.Path) -> bool:
    """Check if the file has already been downloaded."""
    file_downloaded = False
    config = read_config()
    if file_name in config:
    if str(file_path) in config:
        file_downloaded = True
    return file_downloaded


def check_file_etag(etag: str) -> bool:
def check_file_etag(file_path: pathlib.Path, etag: str) -> bool:
    """Check if the file is up to date."""
    is_same_etag = False
    config = read_config()
    if etag in config.values():
        is_same_etag = True
    try:
        if config[str(file_path)] == etag:
            is_same_etag = True
    except KeyError:
        pass
    return is_same_etag