~ciriarte/apizotl

ff48811c56c51ca6d9d01649a76f63e52e12832f — Carlos Iriarte 2 years ago 8239416
wip: parsing deputies
2 files changed, 180 insertions(+), 16 deletions(-)

M src/deputies.rs
M src/senators.rs
M src/deputies.rs => src/deputies.rs +148 -2
@@ 1,5 1,151 @@
use crate::opt::Opt;
use serde::{Deserialize, Serialize};

pub fn run(_opts: Opt) {
use async_std::{process, task};
use failure::Error;

use kuchiki::{ElementData, traits::*};

use crate::{log_error, opt::Opt};

use reqwest::Url;

/// One member of Congress as scraped from a legislature page.
///
/// Derives `Serialize`/`Deserialize` so records can be (de)serialized with
/// serde, and `Debug` because `extract` prints each record with `{:?}`.
#[derive(Serialize, Deserialize, Debug)]
struct CongressMember {
    /// Member's name: the text of the first `<a>` in the name cell, or an
    /// empty string when that cell contains no link.
    name: String,
    /// State the member represents. `extract` in this file currently always
    /// stores an empty string here.
    state: String,
    /// Electoral district number. `extract` in this file currently always
    /// stores `0` here.
    district: u8
}

/// Entry point for the deputies command.
///
/// Drives the async [`process`] routine to completion on the current thread.
/// If it fails, the error is reported through `log_error` and the program
/// terminates with exit status 1.
pub fn run(opts: Opt) {
    // `process(..)` names the async fn below; `process::exit` names the
    // imported module. They live in different namespaces, so both resolve.
    let outcome = task::block_on(process(&opts));

    if let Err(e) = outcome {
        log_error(&e);
        process::exit(1);
    }
}

/// Crawls the Spanish-Wikipedia index of Mexican Congress legislatures and
/// calls [`extract`] on every page it links to.
///
/// Only anchors inside the first cell of a `.wikitable` row are considered,
/// and only those whose `href` starts with `/wiki`; such links are made
/// absolute using the scheme and host of the index URL.
///
/// NOTE(review): the download uses `reqwest::blocking` even though this is an
/// `async fn`, so the calling executor thread is blocked while it runs.
///
/// # Errors
/// Returns an error if the index URL cannot be parsed, the HTTP request or
/// body copy fails, the body is not valid UTF-8, or any [`extract`] call
/// fails (the crawl stops at the first such failure).
///
/// # Panics
/// Panics if a CSS selector fails to compile or the index URL has no host;
/// both are fixed literals in this function.
async fn process(_opts: &Opt) -> std::result::Result<(), Error> {
    let url = Url::parse(
        "https://es.wikipedia.org/wiki/Anexo:Congresos_y_Legislaturas_del_Congreso_de_la_Uni%C3%B3n_de_M%C3%A9xico",
    )?;

    // Fetch the whole index page into memory and decode it as strict UTF-8.
    let mut response = reqwest::blocking::get(url.clone())?;
    let mut body: Vec<u8> = Vec::new();
    response.copy_to(&mut body)?;
    let html = String::from_utf8(body)?;

    let document = kuchiki::parse_html().one(html);

    for table_match in document.select(".wikitable").unwrap() {
        let anchors = table_match.as_node().select("td:first-child a").unwrap();

        for anchor in anchors {
            // A missing `href` is treated as an empty string, which the
            // prefix check below then rejects.
            let href = anchor
                .attributes
                .borrow()
                .get("href")
                .unwrap_or_default()
                .to_string();

            if !href.starts_with("/wiki") {
                continue;
            }

            let link = format!("{}://{}{}", url.scheme(), url.host_str().unwrap(), href);
            extract(link)?;
        }
    }

    Ok(())
}

fn extract(link: String) -> std::result::Result<(), Error> {
    println!("{:?}", link);
    let url = Url::parse(&link)?;

    let mut res = reqwest::blocking::get(url.to_owned())?;

    let mut buf: Vec<u8> = vec![];
    res.copy_to(&mut buf)?;
    let s: String = String::from_utf8(buf)?;

    let document = kuchiki::parse_html().one(s);
    // let h3_list = document.select("h3").unwrap();

    // for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por distrito".to_string())) {
    //     let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
    //     println!("  {:?}", h3.as_node().text_contents());
    //     for tr in table.select("tr:not(:first-child)").unwrap() {
    //         let cols: Vec<kuchiki::NodeDataRef<ElementData>> = tr.as_node()
    //                     .select("td")
    //                     .unwrap()
    //                     .collect();

    //         let count = cols.len();
    //         let (first, second) = cols.split_at(count / 2);

    //         for col in &[first, second] {
    //             let person = CongressMember {
    //                 district: col[1].text_contents().parse::<u8>().unwrap_or_default(),
    //                 state: col[0].text_contents(),
    //                 name: col[2]
    //                     .as_node()
    //                     .select_first("a")
    //                     .map_or_else(|_| "".to_owned(), |v| v.text_contents())
    //             };
    
    //             println!("{:?}", person);                
    //         }
    //     }
    // }

    let h3_list = document.select("h3").unwrap();
    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por representaci".to_string())) {
        let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
        println!("  {:?}", h3.as_node().text_contents());
        for tr in table.select("tr:not(:first-child)").unwrap() {
            let cols: Vec<kuchiki::NodeDataRef<ElementData>> = tr.as_node()
                        .select("td")
                        .unwrap()
                        .collect();

            let count = cols.len();
            let (first, second) = cols.split_at(count / 2);
            let sub_cols = &[first, second];

            for col in sub_cols {
                let person = CongressMember {
                    district: 0,
                    state: "".to_owned(),
                    name: col[1]
                        .as_node()
                        .select_first("a")
                        .map_or_else(|_| "".to_owned(), |v| v.text_contents())
                };
    
                println!("{:?}", person);                
            }
        }
    }

    Ok(())
}

/// Keeps `h` only when its text content begins with `title`.
///
/// Returns `Some(h)` (the same element, moved through) if the element's text
/// starts with `title`, and `None` otherwise. The shape is convenient for
/// `Iterator::filter_map`.
fn with_title(
    h: kuchiki::NodeDataRef<kuchiki::ElementData>,
    title: String,
) -> Option<kuchiki::NodeDataRef<kuchiki::ElementData>> {
    Some(h).filter(|element| element.text_contents().starts_with(&title))
}
\ No newline at end of file

M src/senators.rs => src/senators.rs +32 -14
@@ 1,12 1,21 @@
use serde::{Deserialize, Serialize};

use async_std::{process, task};
use failure::Error;

use kuchiki::traits::*;
use kuchiki::{ElementData, traits::*};

use crate::{log_error, opt::Opt};

use reqwest::Url;

/// One member of Congress as scraped from a row of a legislature table.
///
/// Derives `Serialize`/`Deserialize` for serde support and `Debug` because
/// each record is printed with `{:?}` after it is built.
#[derive(Serialize, Deserialize, Debug)]
struct CongressMember {
    /// Member's name: the text of the first `<a>` in the third cell of the
    /// entry, or an empty string when that cell contains no link.
    name: String,
    /// Text content of the first cell of the entry.
    state: String,
    /// Second cell of the entry parsed as `u8`; falls back to `0` when the
    /// text does not parse.
    district: u8
}

pub fn run(opts: Opt) {
    task::block_on(async {
        if let Err(e) = process(&opts).await {


@@ 49,9 58,8 @@ async fn process(_opts: &Opt) -> std::result::Result<(), Error> {
                None
            };

            match link {
                Some(l) => extract(l)?,
                None => {}
            if let Some(l) = link {
                extract(l)?
            }
        }
    }


@@ 72,20 80,30 @@ fn extract(link: String) -> std::result::Result<(), Error> {
    let document = kuchiki::parse_html().one(s);
    let h3_list = document.select("h3").unwrap();

    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por".to_string())) {
    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por distrito".to_string())) {
        let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
        println!("  {:?}", h3.as_node().text_contents());
        for tr in table.select("tr:not(:first-child)").unwrap() {
            print!("      ");
            for td in tr.as_node().select("td:not(:nth-child(n+4))").unwrap() {
                print!("{:?} ", td.as_node().text_contents());
            }
            println!();
            print!("      ");
            for td in tr.as_node().select("td:nth-child(n+4)").unwrap() {
                print!("{:?} ", td.as_node().text_contents());
            let cols: Vec<kuchiki::NodeDataRef<ElementData>> = tr.as_node()
                        .select("td")
                        .unwrap()
                        .collect();

            let count = cols.len();
            let (first, second) = cols.split_at(count / 2);

            for col in &[first, second] {
                let person = CongressMember {
                    district: col[1].text_contents().parse::<u8>().unwrap_or_default(),
                    state: col[0].text_contents(),
                    name: col[2]
                        .as_node()
                        .select_first("a")
                        .map_or_else(|_| "".to_owned(), |v| v.text_contents())
                };
    
                println!("{:?}", person);                
            }
            println!();
        }
    }