~tieong/download-course

ffddff5e943c99ece508fa1a24afa2279590176e — Thomas Ieong 2 years ago 6f21d3f
Added workaround.

When a document link doesn't point directly to the resource, follow it to
the page it redirects to and get the resource link from there.
3 files changed, 114 insertions(+), 37 deletions(-)

M README.md
M download_course/download_course.py
M download_course/utils/selectors.py
M README.md => README.md +3 -0
@@ 3,6 3,9 @@
Short script to download courses from the REPLACE_ME platform.
Need to set the username and the password as environment variables.

Some files are redownloaded every time. From the little I've seen, this mostly concerns
the interactive HTML modules: they change their etag every single time, so there isn't much to do.


```
usage: download_course.py [-h] --city CITY [--promotion PROMOTION] [-o OUTPUT_DIR] [--debug]

M download_course/download_course.py => download_course/download_course.py +110 -37
@@ 265,31 265,76 @@ class REPLACE_MEPage():

    def _get_filename_and_etag(
            self,
            document_link_head: requests.models.Response
            document_link_head: requests.models.Response,
            workaround: bool
    ) -> tuple[str, str]:
        file_name = ""
        etag = ""
        try:
            document_link = self.session.head(
                        document_link_head.headers["Location"]
            )
            etag = document_link.headers["etag"].replace('"', "")
            file_name = document_link.headers[
                        "Content-Disposition"
            ].split("=")[-1].replace('"', "")
            if not workaround:
                document_link = self.session.head(
                            document_link_head.headers["Location"]
                )
                etag = document_link.headers["etag"].replace('"', "")
                file_name = document_link.headers[
                            "Content-Disposition"
                ].split("=")[-1].replace('"', "")
            else:
                etag = document_link_head.headers["etag"].replace('"', "")
                file_name = document_link_head.headers[
                            "Content-Disposition"
                ].split("=")[-1].replace('"', "")
        except KeyError:
            # This happens when the file points to an html module.
            logging.info(
                "The current file not downloadable and will be skipped!"
            logging.debug(
                (
                    "The current file is not downloadable! "
                    "This happens when the file points "
                    "to an interactive html module or an external link! "
                    "Trying the workaround..."
                )
            )
        return file_name, etag

    def _get_file_workaround(
            self,
            document_element: Tag
    ) -> Tag | None:
        """
        Handle the case where the link doesn't point to the file directly.

        The page referenced by the element's ``href`` attribute is fetched
        with the current session and parsed with the lxml parser. The
        result is the first element matching the ``RESOURCE_WORKAROUND``
        selector, or ``None`` when the page has no such element.
        """
        redirect_url = document_element.attrs["href"]
        response = self.session.get(redirect_url)
        parsed_page = BeautifulSoup(response.content, features="lxml")
        return parsed_page.select_one(
            REPLACE_MESelectors.RESOURCE_WORKAROUND.value
        )

    def _download_file_workaround(
            self,
            document_link_head: requests.models.Response,
            workaround: bool,
    ) -> requests.models.Response:
        """
        Fetch the document with a GET request on the current session.

        When ``workaround`` is false the target is the ``Location`` header
        of ``document_link_head``; otherwise it is the URL of
        ``document_link_head`` itself.
        """
        if workaround:
            target_url = document_link_head.url
        else:
            target_url = document_link_head.headers["Location"]
        return self.session.get(target_url)

    # pylint: disable=too-many-arguments
    def download_files(
            self,
            file_name: str,
            etag: str,
            document_link_head: requests.models.Response,
            section_dir: pathlib.Path
            section_dir: pathlib.Path,
            workaround: bool
    ) -> None:
        """Download files if they are missing or outdated."""
        init_config()


@@ 303,8 348,9 @@ class REPLACE_MEPage():
            )
            file_metadata = {str(document_path): etag}
            update_config(file_metadata)
            document_dl = self.session.get(
                document_link_head.headers["Location"]
            document_dl = self._download_file_workaround(
                document_link_head,
                workaround
            )
            document_path.write_bytes(document_dl.content)
            self.files_downloaded += 1


@@ 314,8 360,9 @@ class REPLACE_MEPage():
                file_name
            )
            update_etag(document_path, etag)
            document_dl = self.session.get(
                document_link_head.headers["Location"]
            document_dl = self._download_file_workaround(
                document_link_head,
                workaround
            )
            document_path.write_bytes(document_dl.content)
            self.files_downloaded += 1


@@ 325,14 372,48 @@ class REPLACE_MEPage():
                file_name
            )

    def download_document(
            self,
            document_element: Tag,
            section_dir: pathlib.Path
    ) -> None:
        """
        Download a single document into ``section_dir``.

        The link of ``document_element`` is tried first. When no file name
        or etag can be read from it, the page behind the link is searched
        for the actual resource link (the workaround) and the lookup is
        repeated on that link. A document that cannot be resolved either
        way is skipped.
        """
        workaround = False
        document_link_head = self._get_document_link_head(
            document_element
        )
        file_name, etag = self._get_filename_and_etag(
            document_link_head,
            workaround
        )
        if not file_name or not etag:
            document_tag = self._get_file_workaround(
                document_element
            )
            if document_tag is None:
                return
            workaround = True
            document_link_head = self._get_document_link_head(
                document_tag
            )
            file_name, etag = self._get_filename_and_etag(
                document_link_head,
                workaround
            )
            if not file_name or not etag:
                # The lookup reports a failure as empty strings; without
                # this guard an empty file name and etag would be handed
                # to download_files.
                logging.info(
                    "The current file is not downloadable "
                    "and will be skipped!"
                )
                return
        self.download_files(
            file_name,
            etag,
            document_link_head,
            section_dir,
            workaround
        )

    def get_courses_materials(
            self,
            course_brick_element: Tag,
            download_dir_path: pathlib.Path,
            promotion: str
    ) -> None:
        """Download all the resources files."""
        download_dir_path = download_dir_path / promotion
        courses_section_elements = self._get_courses_section_elements(
            course_brick_element
        )


@@ 356,25 437,13 @@ class REPLACE_MEPage():
                else:
                    continue
                for document_element in documents_element:
                    document_link_head = self._get_document_link_head(
                        document_element
                    )
                    file_name, etag = self._get_filename_and_etag(
                        document_link_head
                    )
                    if not file_name or not etag:
                        continue
                    self.download_files(
                        file_name,
                        etag,
                        document_link_head,
                        section_dir
                    )
                    self.download_document(document_element, section_dir)


def download_courses(
        args: argparse.Namespace,
        REPLACE_ME_session: requests.Session
        REPLACE_ME_session: requests.Session,
        download_dir_path: pathlib.Path
) -> None:
    """Download courses files on the moodle platform."""
    REPLACE_ME_page = REPLACE_MEPage(REPLACE_ME_session)


@@ 400,26 469,30 @@ def download_courses(
        )
        REPLACE_ME_page.get_courses_materials(
            course_brick,
            args.output_dir,
            args.promotion
            download_dir_path,
        )
    logging.info("The config file is located at %s", STORAGE_FILE)
    logging.info(
        "The number of files downloaded is : %s",
        REPLACE_ME_page.files_downloaded
    )
    logging.info(
        "The files are located in %s",
        download_dir_path
    )


def main() -> None:
    """Entry point."""
    args = get_args()
    (args.output_dir / args.promotion).mkdir(exist_ok=True)
    download_dir_path = args.output_dir / args.promotion
    download_dir_path.mkdir(exist_ok=True)
    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    REPLACE_ME_session = requests.Session()
    with REPLACE_ME_session:
        download_courses(args, REPLACE_ME_session)
        download_courses(args, REPLACE_ME_session, download_dir_path)


if __name__ == "__main__":

M download_course/utils/selectors.py => download_course/utils/selectors.py +1 -0
@@ 30,3 30,4 @@ class REPLACE_MESelectors(Enum):
    FORM = "form"
    TOPICS = ".topics > li"
    DOCUMENTS = "ul > li[class='activity resource modtype_resource'] * a"
    RESOURCE_WORKAROUND = ".resourceworkaround > a"