M src/deputies.rs => src/deputies.rs +148 -2
@@ 1,5 1,151 @@
-use crate::opt::Opt;
+use serde::{Deserialize, Serialize};
-pub fn run(_opts: Opt) {
+use async_std::{process, task};
+use failure::Error;
+use kuchiki::{ElementData, traits::*};
+
+use crate::{log_error, opt::Opt};
+
+use reqwest::Url;
+
+/// One congress member record as printed by the scraper in this file.
+///
+/// Derives `Serialize`/`Deserialize` so it can be (de)serialized with serde,
+/// and `Debug` because `extract` prints each record with `{:?}`.
+#[derive(Serialize, Deserialize, Debug)]
+struct CongressMember {
+ /// Member name; `extract` takes it from the text of the first `<a>` in the
+ /// name cell and stores an empty string when the cell has no link.
+ name: String,
+ /// Presumably the state the member represents; the active code in
+ /// `extract` always stores an empty string here.
+ state: String,
+ /// Presumably the electoral district number; the active code in `extract`
+ /// always stores `0` here.
+ district: u8
+}
+
+/// Entry point for the deputies command.
+///
+/// Drives the async `process` future to completion on the current thread.
+/// If it fails, the error is passed to `log_error` and the program exits
+/// with status code 1.
+pub fn run(opts: Opt) {
+    let outcome = task::block_on(process(&opts));
+
+    if let Err(e) = outcome {
+        log_error(&e);
+        // `process` here is the imported module, not the local function:
+        // modules and functions live in different namespaces.
+        process::exit(1);
+    }
+}
+
+/// Downloads the Spanish Wikipedia index of Mexican congresses and
+/// legislatures and runs `extract` on every page it links to.
+///
+/// Only anchors found in the first cell of a row inside a `.wikitable`
+/// element are considered, and only when their `href` starts with `/wiki`.
+/// Each such path is turned into an absolute URL using the scheme and host
+/// of the index page.
+///
+/// Returns the first error raised by URL parsing, the HTTP request, UTF-8
+/// decoding of the body, or `extract`.
+///
+/// NOTE(review): this is an `async fn` but it uses `reqwest::blocking`, so
+/// the calling thread is blocked for the whole download.
+async fn process(_opts: &Opt) -> std::result::Result<(), Error> {
+    let url = Url::parse(
+        "https://es.wikipedia.org/wiki/Anexo:Congresos_y_Legislaturas_del_Congreso_de_la_Uni%C3%B3n_de_M%C3%A9xico"
+    )?;
+
+    // Read the whole response body into memory and decode it as UTF-8.
+    let mut response = reqwest::blocking::get(url.to_owned())?;
+    let mut body: Vec<u8> = vec![];
+    response.copy_to(&mut body)?;
+    let html = String::from_utf8(body)?;
+
+    let document = kuchiki::parse_html().one(html);
+
+    for table in document.select(".wikitable").unwrap() {
+        for anchor in table.as_node().select("td:first-child a").unwrap() {
+            // A missing `href` becomes "" and is rejected by the prefix test.
+            let href = anchor
+                .attributes
+                .borrow()
+                .get("href")
+                .unwrap_or_default()
+                .to_string();
+
+            if href.starts_with("/wiki") {
+                let link = format!("{}://{}{}", url.scheme(), url.host_str().unwrap(), href);
+                extract(link)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Downloads one legislature page and prints the deputies listed under every
+/// `<h3>` whose text starts with "Diputados por representaci".
+///
+/// The table is taken to be the second sibling after the heading. Each data
+/// row (every `<tr>` except the first child) is split into two halves, and
+/// each half is read as one member whose name is the text of the first `<a>`
+/// in the half's second cell. `state` and `district` are not read from these
+/// tables and are stored as `""` and `0`.
+///
+/// Headings without a second sibling, and row halves with fewer than two
+/// `<td>` cells (for example rows made only of `<th>`), are skipped instead
+/// of panicking.
+///
+/// Returns an error if `link` is not a valid URL, the HTTP request fails, or
+/// the body is not valid UTF-8.
+fn extract(link: String) -> std::result::Result<(), Error> {
+    println!("{:?}", link);
+    let url = Url::parse(&link)?;
+
+    let mut res = reqwest::blocking::get(url.to_owned())?;
+
+    let mut buf: Vec<u8> = vec![];
+    res.copy_to(&mut buf)?;
+    let s: String = String::from_utf8(buf)?;
+
+    let document = kuchiki::parse_html().one(s);
+
+    // TODO: the "Diputados por distrito" tables are not parsed here yet. The
+    // earlier draft read state, district and name from cells 0, 1 and 2 of
+    // each row half.
+
+    // The selectors below are constants, so a parse failure is a programming
+    // error and `unwrap` is acceptable.
+    let h3_list = document.select("h3").unwrap();
+    for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por representaci".to_string())) {
+        println!(" {:?}", h3.as_node().text_contents());
+
+        // Page layout is outside our control: skip the heading when there is
+        // no second sibling rather than aborting the whole scrape.
+        let table = match h3.as_node().next_sibling().and_then(|n| n.next_sibling()) {
+            Some(table) => table,
+            None => continue,
+        };
+
+        for tr in table.select("tr:not(:first-child)").unwrap() {
+            let cols: Vec<kuchiki::NodeDataRef<ElementData>> =
+                tr.as_node().select("td").unwrap().collect();
+
+            // Each row carries two members side by side.
+            let (first, second) = cols.split_at(cols.len() / 2);
+
+            for half in &[first, second] {
+                // Checked lookup: a short row would make `half[1]` panic.
+                let name_cell = match half.get(1) {
+                    Some(cell) => cell,
+                    None => continue,
+                };
+
+                let person = CongressMember {
+                    district: 0,
+                    state: "".to_owned(),
+                    name: name_cell
+                        .as_node()
+                        .select_first("a")
+                        .map_or_else(|_| "".to_owned(), |v| v.text_contents()),
+                };
+
+                println!("{:?}", person);
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Keeps `h` only when its text content begins with `title`.
+///
+/// Returns `Some(h)` on a prefix match and `None` otherwise, so the function
+/// can be used directly as a `filter_map` callback.
+fn with_title(
+    h: kuchiki::NodeDataRef<kuchiki::ElementData>,
+    title: String,
+) -> Option<kuchiki::NodeDataRef<kuchiki::ElementData>> {
+    Some(h).filter(|heading| heading.text_contents().starts_with(&title))
}=
\ No newline at end of file
M src/senators.rs => src/senators.rs +32 -14
@@ 1,12 1,21 @@
+use serde::{Deserialize, Serialize};
+
use async_std::{process, task};
use failure::Error;
-use kuchiki::traits::*;
+use kuchiki::{ElementData, traits::*};
use crate::{log_error, opt::Opt};
use reqwest::Url;
+/// One congress member record as printed by the scraper in this file.
+///
+/// Derives `Serialize`/`Deserialize` so it can be (de)serialized with serde,
+/// and `Debug` because `extract` prints each record with `{:?}`.
+#[derive(Serialize, Deserialize, Debug)]
+struct CongressMember {
+ /// Member name; `extract` takes it from the text of the first `<a>` in the
+ /// third cell of a row half, or stores an empty string when there is none.
+ name: String,
+ /// Text content of the first cell of a row half.
+ state: String,
+ /// Second cell of a row half parsed as `u8`; `0` when parsing fails.
+ district: u8
+}
+
pub fn run(opts: Opt) {
task::block_on(async {
if let Err(e) = process(&opts).await {
@@ 49,9 58,8 @@ async fn process(_opts: &Opt) -> std::result::Result<(), Error> {
None
};
- match link {
- Some(l) => extract(l)?,
- None => {}
+ if let Some(l) = link {
+ extract(l)?
}
}
}
@@ 72,20 80,30 @@ fn extract(link: String) -> std::result::Result<(), Error> {
let document = kuchiki::parse_html().one(s);
let h3_list = document.select("h3").unwrap();
- for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por".to_string())) {
+ for h3 in h3_list.filter_map(|h| with_title(h, "Diputados por distrito".to_string())) {
let table = h3.as_node().next_sibling().unwrap().next_sibling().unwrap();
println!(" {:?}", h3.as_node().text_contents());
for tr in table.select("tr:not(:first-child)").unwrap() {
- print!(" ");
- for td in tr.as_node().select("td:not(:nth-child(n+4))").unwrap() {
- print!("{:?} ", td.as_node().text_contents());
- }
- println!();
- print!(" ");
- for td in tr.as_node().select("td:nth-child(n+4)").unwrap() {
- print!("{:?} ", td.as_node().text_contents());
+ let cols: Vec<kuchiki::NodeDataRef<ElementData>> = tr.as_node()
+ .select("td")
+ .unwrap()
+ .collect();
+
+ let count = cols.len();
+ let (first, second) = cols.split_at(count / 2);
+
+ for col in &[first, second] {
+ let person = CongressMember {
+ district: col[1].text_contents().parse::<u8>().unwrap_or_default(),
+ state: col[0].text_contents(),
+ name: col[2]
+ .as_node()
+ .select_first("a")
+ .map_or_else(|_| "".to_owned(), |v| v.text_contents())
+ };
+
+ println!("{:?}", person);
}
- println!();
}
}