@@ 3,6 3,9 @@
Short script to download courses from the REPLACE_ME platform.
Need to set the username and the password as environment variables.
+Some files get redownloaded every time. From the little I've seen, this mostly concerns
+the interactive HTML modules: their etag changes every single time, so there isn't much to do.
+
```
usage: download_course.py [-h] --city CITY [--promotion PROMOTION] [-o OUTPUT_DIR] [--debug]
@@ 265,31 265,76 @@ class REPLACE_MEPage():
def _get_filename_and_etag(
self,
- document_link_head: requests.models.Response
+ document_link_head: requests.models.Response,
+ workaround: bool
) -> tuple[str, str]:
file_name = ""
etag = ""
try:
- document_link = self.session.head(
- document_link_head.headers["Location"]
- )
- etag = document_link.headers["etag"].replace('"', "")
- file_name = document_link.headers[
- "Content-Disposition"
- ].split("=")[-1].replace('"', "")
+ if not workaround:
+ document_link = self.session.head(
+ document_link_head.headers["Location"]
+ )
+ etag = document_link.headers["etag"].replace('"', "")
+ file_name = document_link.headers[
+ "Content-Disposition"
+ ].split("=")[-1].replace('"', "")
+ else:
+ etag = document_link_head.headers["etag"].replace('"', "")
+ file_name = document_link_head.headers[
+ "Content-Disposition"
+ ].split("=")[-1].replace('"', "")
except KeyError:
- # This happens when the file points to an html module.
- logging.info(
- "The current file not downloadable and will be skipped!"
+ logging.debug(
+ (
+ "The current file is not downloadable! "
+ "This happens when the file points "
+ "to an interactive html module or an external link! "
+ "Trying the workaround..."
+ )
)
return file_name, etag
+ def _get_file_workaround(
+ self,
+ document_element: Tag
+ ) -> Tag | None:
+ """
+ This takes care of the case where
+ the link doesn't point to the file directly.
+ """
+ document_tag = None
+ document_link_redirect = document_element.attrs["href"]
+ redirect_page = self.session.get(document_link_redirect)
+ soup = BeautifulSoup(redirect_page.content, features="lxml")
+ document_tag = soup.select_one(
+ REPLACE_MESelectors.RESOURCE_WORKAROUND.value
+ )
+ return document_tag
+
+ def _download_file_workaround(
+ self,
+ document_link_head: requests.models.Response,
+ workaround: bool,
+ ) -> requests.models.Response:
+ if not workaround:
+ document_dl = self.session.get(
+ document_link_head.headers["Location"]
+ )
+ else:
+ document_dl = self.session.get(
+ document_link_head.url
+ )
+ return document_dl
+
+ # pylint: disable=too-many-arguments
def download_files(
self,
file_name: str,
etag: str,
document_link_head: requests.models.Response,
- section_dir: pathlib.Path
+ section_dir: pathlib.Path,
+ workaround: bool
) -> None:
"""Download files if they are missing or outdated."""
init_config()
@@ 303,8 348,9 @@ class REPLACE_MEPage():
)
file_metadata = {str(document_path): etag}
update_config(file_metadata)
- document_dl = self.session.get(
- document_link_head.headers["Location"]
+ document_dl = self._download_file_workaround(
+ document_link_head,
+ workaround
)
document_path.write_bytes(document_dl.content)
self.files_downloaded += 1
@@ 314,8 360,9 @@ class REPLACE_MEPage():
file_name
)
update_etag(document_path, etag)
- document_dl = self.session.get(
- document_link_head.headers["Location"]
+ document_dl = self._download_file_workaround(
+ document_link_head,
+ workaround
)
document_path.write_bytes(document_dl.content)
self.files_downloaded += 1
@@ 325,14 372,48 @@ class REPLACE_MEPage():
file_name
)
+ def download_document(
+ self,
+ document_element: Tag,
+ section_dir: pathlib.Path
+ ) -> None:
+ """Download documents."""
+ workaround = False
+ document_link_head = self._get_document_link_head(
+ document_element
+ )
+ file_name, etag = self._get_filename_and_etag(
+ document_link_head,
+ False
+ )
+ if not file_name or not etag:
+ document_tag = self._get_file_workaround(
+ document_element
+ )
+ if document_tag is None:
+ return
+ document_link_head = self._get_document_link_head(
+ document_tag
+ )
+ file_name, etag = self._get_filename_and_etag(
+ document_link_head,
+ True
+ )
+ workaround = True
+ self.download_files(
+ file_name,
+ etag,
+ document_link_head,
+ section_dir,
+ workaround
+ )
+
def get_courses_materials(
self,
course_brick_element: Tag,
download_dir_path: pathlib.Path,
- promotion: str
) -> None:
"""Download all the resources files."""
- download_dir_path = download_dir_path / promotion
courses_section_elements = self._get_courses_section_elements(
course_brick_element
)
@@ 356,25 437,13 @@ class REPLACE_MEPage():
else:
continue
for document_element in documents_element:
- document_link_head = self._get_document_link_head(
- document_element
- )
- file_name, etag = self._get_filename_and_etag(
- document_link_head
- )
- if not file_name or not etag:
- continue
- self.download_files(
- file_name,
- etag,
- document_link_head,
- section_dir
- )
+ self.download_document(document_element, section_dir)
def download_courses(
args: argparse.Namespace,
- REPLACE_ME_session: requests.Session
+ REPLACE_ME_session: requests.Session,
+ download_dir_path: pathlib.Path
) -> None:
"""Download courses files on the moodle platform."""
REPLACE_ME_page = REPLACE_MEPage(REPLACE_ME_session)
@@ 400,26 469,30 @@ def download_courses(
)
REPLACE_ME_page.get_courses_materials(
course_brick,
- args.output_dir,
- args.promotion
+ download_dir_path,
)
logging.info("The config file is located at %s", STORAGE_FILE)
logging.info(
"The number of files downloaded is : %s",
REPLACE_ME_page.files_downloaded
)
+ logging.info(
+ "The files are located in %s",
+ download_dir_path
+ )
def main() -> None:
"""Entry point."""
args = get_args()
- (args.output_dir / args.promotion).mkdir(exist_ok=True)
+ download_dir_path = args.output_dir / args.promotion
+ download_dir_path.mkdir(exist_ok=True)
if args.debug:
logging.getLogger().setLevel(logging.DEBUG)
REPLACE_ME_session = requests.Session()
with REPLACE_ME_session:
- download_courses(args, REPLACE_ME_session)
+ download_courses(args, REPLACE_ME_session, download_dir_path)
if __name__ == "__main__":