~ciriarte/apizotl

29b6054437bb1df4e4f48d54c9c70d35b15345fb — Carlos Iriarte 2 years ago 6f846ed
fix: add support for legislatures
9 files changed, 184 insertions(+), 36 deletions(-)

A .vscode/launch.json
A .vscode/settings.json
R src/{deputies.rs => congress/deputies.rs}
A src/congress/legislatures.rs
A src/congress/mod.rs
R src/{senators.rs => congress/senators.rs}
M src/main.rs
M src/opt.rs
A src/supreme-court.rs
A .vscode/launch.json => .vscode/launch.json +45 -0
@@ 0,0 1,45 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "type": "lldb",
            "request": "launch",
            "name": "Debug executable 'apizotl'",
            "cargo": {
                "args": [
                    "build",
                    "--bin=apizotl",
                    "--package=apizotl"
                ],
                "filter": {
                    "name": "apizotl",
                    "kind": "bin"
                }
            },
            "args": ["senators"],
            "cwd": "${workspaceFolder}"
        },
        {
            "type": "lldb",
            "request": "launch",
            "name": "Debug unit tests in executable 'apizotl'",
            "cargo": {
                "args": [
                    "test",
                    "--no-run",
                    "--bin=apizotl",
                    "--package=apizotl"
                ],
                "filter": {
                    "name": "apizotl",
                    "kind": "bin"
                }
            },
            "args": [],
            "cwd": "${workspaceFolder}"
        }
    ]
}
\ No newline at end of file

A .vscode/settings.json => .vscode/settings.json +4 -0
@@ 0,0 1,4 @@
{
    "rust-analyzer.cargo.allFeatures": true,
    "rust-analyzer.checkOnSave.command": "clippy"
}
\ No newline at end of file

R src/deputies.rs => src/congress/deputies.rs +3 -14
@@ 81,7 81,7 @@ fn extract(link: String) -> std::result::Result<Vec<CongressMember>, Error> {
    println!("{:?}", link);
    let url = Url::parse(&link)?;

    let mut res = reqwest::blocking::get(url.to_owned())?;
    let mut res = reqwest::blocking::get(url)?;

    let mut buf: Vec<u8> = vec![];
    res.copy_to(&mut buf)?;


@@ 92,7 92,7 @@ fn extract(link: String) -> std::result::Result<Vec<CongressMember>, Error> {
    let document = kuchiki::parse_html().one(s);
    let h3_list = document.select("h3").unwrap();

    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por distrito".to_string())) {
    for h3 in h3_list.filter_map(|h| super::with_title(h, "Diputados por distrito".to_string())) {
        let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
        println!("  {:?}", h3.as_node().text_contents());
        for tr in table.select("tr:not(:first-child)").unwrap() {


@@ 121,7 121,7 @@ fn extract(link: String) -> std::result::Result<Vec<CongressMember>, Error> {
    }

    let h3_list = document.select("h3").unwrap();
    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por representaci".to_string())) {
    for h3 in h3_list.filter_map(|h| super::with_title(h, "Diputados por representaci".to_string())) {
        let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
        println!("  {:?}", h3.as_node().text_contents());
        for tr in table.select("tr:not(:first-child)").unwrap() {


@@ 155,14 155,3 @@ fn extract(link: String) -> std::result::Result<Vec<CongressMember>, Error> {

    Ok(deputies)
}

/// Keeps `h` only when its text content begins with `title`.
///
/// Returns `Some(h)` on a prefix match and `None` otherwise, which makes the
/// function usable directly as an `Iterator::filter_map` callback.
fn with_title(
    h: kuchiki::NodeDataRef<kuchiki::ElementData>,
    title: String,
) -> Option<kuchiki::NodeDataRef<kuchiki::ElementData>> {
    Some(h).filter(|heading| heading.text_contents().starts_with(title.as_str()))
}
\ No newline at end of file

A src/congress/legislatures.rs => src/congress/legislatures.rs +94 -0
@@ 0,0 1,94 @@
use async_std::{process, task};
use failure::Error;

use regex::Regex;

use kuchiki::{ElementData, traits::*};

use crate::{log_error, opt::Opt};

use reqwest::Url;
/// One row of the scraped legislatures table.
#[derive(Debug)]
struct Legislature {
    /// Text of the link found in the row's first cell.
    name: String,
    /// Date text matched in the row's second cell.
    start: String,
    /// Date text matched in the row's third cell; empty when no date matched.
    end: String,
}

/// Entry point for the legislatures command.
///
/// Drives `process` to completion on the current thread. On failure the error
/// is passed to `log_error` and the program exits with status 1; on success
/// the returned list is dropped.
pub fn run(opts: Opt) {
    let outcome = task::block_on(process(&opts));

    if let Err(err) = outcome {
        log_error(&err);
        process::exit(1);
    }
}

async fn process(_opts: &Opt) -> std::result::Result<Vec<Legislature>, Error> {
    let mut legislatures: Vec<Legislature> = vec![];

    let url = 
        Url::parse(
            "https://es.wikipedia.org/wiki/Anexo:Congresos_y_Legislaturas_del_Congreso_de_la_Uni%C3%B3n_de_M%C3%A9xico"
        )?;

    let mut res = reqwest::blocking::get(url.to_owned())?;

    let mut buf: Vec<u8> = vec![];
    res.copy_to(&mut buf)?;
    let s: String = String::from_utf8(buf)?;

    let row_selector = ".wikitable tbody tr";
    let document = kuchiki::parse_html().one(s);

    for tr in document.select(row_selector).unwrap() {
        let cells: Vec<kuchiki::NodeDataRef<ElementData>> = 
            tr.as_node()
              .select("td")
              .unwrap()
              .collect();

        // for some reason kuchiki captures tr from thead
        if cells.is_empty() {
            continue;
        }

        let a = cells[0].as_node().select_first("a").unwrap();
        let href = a
            .attributes
            .borrow()
            .get("href")
            .unwrap_or_default()
            .to_string();
        let name = a.text_contents();

        let link = if href.starts_with("/wiki") {
            let link_str =
                format!("{}://{}{}", url.scheme(), url.host_str().unwrap(), href).to_string();
            Some(link_str)
        } else {
            None
        };

        let date_pattern = Regex::new(r"(?:\d+\s+de\s)?\w+\sde\s\d+").unwrap();

        let start = cells[1].as_node().text_contents();
        let start = date_pattern.captures(&start)
            .unwrap().get(0).unwrap().as_str().to_owned();

        let end = cells[2].text_contents();
        let end = match date_pattern.captures(&end) {
            Some(m) => m.get(0).unwrap().as_str().to_owned(),
            None => "".to_owned()
        };
        let l = Legislature {
            name,
            start,
            end
        };

        legislatures.push(l);
    }

    Ok(legislatures)
}
\ No newline at end of file

A src/congress/mod.rs => src/congress/mod.rs +14 -0
@@ 0,0 1,14 @@
pub mod legislatures;
pub mod deputies;
pub mod senators;

/// Keeps `h` only when its text content begins with `title`.
///
/// Returns `Some(h)` on a prefix match and `None` otherwise, which makes the
/// function usable directly as an `Iterator::filter_map` callback.
fn with_title(
    h: kuchiki::NodeDataRef<kuchiki::ElementData>,
    title: String,
) -> Option<kuchiki::NodeDataRef<kuchiki::ElementData>> {
    Some(h).filter(|heading| heading.text_contents().starts_with(title.as_str()))
}
\ No newline at end of file

R src/senators.rs => src/congress/senators.rs +2 -13
@@ 71,7 71,7 @@ fn extract(link: String) -> std::result::Result<(), Error> {
    println!("{:?}", link);
    let url = Url::parse(&link)?;

    let mut res = reqwest::blocking::get(url.to_owned())?;
    let mut res = reqwest::blocking::get(url)?;

    let mut buf: Vec<u8> = vec![];
    res.copy_to(&mut buf)?;


@@ 80,7 80,7 @@ fn extract(link: String) -> std::result::Result<(), Error> {
    let document = kuchiki::parse_html().one(s);
    let h3_list = document.select("h3").unwrap();

    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por distrito".to_string())) {
    for h3 in h3_list.filter_map(|h| super::with_title(h, "Diputados por distrito".to_string())) {
        let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
        println!("  {:?}", h3.as_node().text_contents());
        for tr in table.select("tr:not(:first-child)").unwrap() {


@@ 109,14 109,3 @@ fn extract(link: String) -> std::result::Result<(), Error> {

    Ok(())
}

/// Keeps `h` only when its text content begins with `title`.
///
/// Returns `Some(h)` on a prefix match and `None` otherwise, which makes the
/// function usable directly as an `Iterator::filter_map` callback.
fn with_title(
    h: kuchiki::NodeDataRef<kuchiki::ElementData>,
    title: String,
) -> Option<kuchiki::NodeDataRef<kuchiki::ElementData>> {
    Some(h).filter(|heading| heading.text_contents().starts_with(title.as_str()))
}

M src/main.rs => src/main.rs +4 -4
@@ 3,8 3,7 @@ use failure::Error;

mod opt;
mod presidents;
mod senators;
mod deputies;
mod congress;

use crate::opt::OutputType;



@@ 15,8 14,9 @@ fn main() {

    match output_type {
        OutputType::Presidents(opts) => crate::presidents::run(opts),
        OutputType::Senators(opts) => crate::senators::run(opts),
        OutputType::Deputies(opts) => crate::deputies::run(opts),
        OutputType::Legislatures(opts) => crate::congress::legislatures::run(opts),
        OutputType::Senators(opts) => crate::congress::senators::run(opts),
        OutputType::Deputies(opts) => crate::congress::deputies::run(opts),
    }
}


M src/opt.rs => src/opt.rs +18 -5
@@ 5,7 5,7 @@ use crate::VERSION;
#[derive(StructOpt, Debug, Clone)]
#[structopt(
    name = "apizotl",
    about = "Means \"glutton\" in Nahuatl, the Aztec's language. Eats raw data and organizes as an API",
    about = "Means \"glutton\" in Nahuatl, the Aztecs' language. Eats raw data and organizes as an API",
    version = VERSION,
    author = "ciriarte <me@ciriarte.dev>",
    setting = DeriveDisplayOrder,


@@ 18,8 18,17 @@ pub struct Opt {
#[derive(StructOpt, Debug, PartialEq, Clone)]
pub enum Command {
    Presidents,
    Deputies,
    Congress {
        #[structopt(subcommand)]
        command: CongressCommand,
    },
}

// Subcommands nested under the `Congress` variant of `Command`; `parse_opts`
// maps each one to the `OutputType` variant of the same name.
//
// Plain `//` comments are used on purpose: structopt turns `///` doc comments
// on commands into CLI help text, which would change the program's output.
#[derive(StructOpt, Debug, PartialEq, Clone, Copy)]
pub enum CongressCommand {
    Legislatures,
    Senators,
    Deputies
}

pub fn parse_opts() -> OutputType {


@@ 27,13 36,17 @@ pub fn parse_opts() -> OutputType {

    match opts.command {
        Command::Presidents { .. } => OutputType::Presidents(opts),
        Command::Deputies { .. } => OutputType::Deputies(opts),
        Command::Senators { .. } => OutputType::Senators(opts),
        Command::Congress { command } => match command {
            CongressCommand::Legislatures { .. } => OutputType::Legislatures(opts),
            CongressCommand::Senators { .. } => OutputType::Senators(opts),
            CongressCommand::Deputies { .. } => OutputType::Deputies(opts),
        }
    }
}

pub enum OutputType {
    Presidents(Opt),
    Deputies(Opt),
    Legislatures(Opt),
    Senators(Opt),
    Deputies(Opt),
}
\ No newline at end of file

A src/supreme-court.rs => src/supreme-court.rs +0 -0