Update congresspeople advisors script #252

Open
wants to merge 9 commits into base: master
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -158,8 +158,8 @@ fetch_latest_backup('data/')
##### Politician's relatives
1. `research/src/get_family_names.py` gets the names of the parents of congresspeople from the congress website and saves them to `research/data/YYYY-MM-DD-congressperson_relatives.xz` (and it may save some data to `research/data/YYYY-MM-DD-congressperson_relatives_raw.xz` in case it fails to parse the names)

-##### Deputies Advisors
-1. `research/src/fetch_deputies_advisors.py` gets the name and point number (and act's issued place and date when available) of all advisors of current deputies from Chamber of Deputies website and saves to `research/data/YYYY-MM-DD-deputies-advisors.xz`
+##### Congresspeople Advisors
+1. `research/src/fetch_congresspeople_advisors.py` gets the name and point number (and the act's issued place and date, when available) of all advisors of current congresspeople from the Chamber of Deputies website and saves them to `research/data/YYYY-MM-DD-congresspeople-advisors.xz`
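The `YYYY-MM-DD-` prefix convention used by these output files can be sketched in a few lines; `dataset_filename` below is a made-up helper for illustration, not part of the repository:

```python
import datetime
import re


def dataset_filename(name, date=None):
    """Build a YYYY-MM-DD-<name>.xz filename, defaulting to today's date."""
    date = date or datetime.date.today()
    return '{}-{}.xz'.format(date.strftime('%Y-%m-%d'), name)


# A fixed date makes the result predictable:
example = dataset_filename('congresspeople-advisors', datetime.date(2017, 5, 1))
```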

##### Federal Budget
1. `research/src/fetch_federal_budget_datasets.py` downloads datasets files of agreements made with Federal Budget and their related amendments. The script gets the lastest version available for each dataset, unpacks, translates columns to english and saves them into `research/data/`. The files are named as follows:
@@ -196,7 +196,7 @@ All files are named with a [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) da
1. `research/data/YYYY-MM-DD-companies.xz`: Dataset with suppliers info containing all the fields offered in the [Federal Revenue alternative API](http://receitaws.com.br) and complemented with geolocation (latitude and longitude) gathered from Google Maps.
1. `research/data/YYYY-MM-DD-congressperson_relatives.xz` contains data on the relatives of congresspeople and the nature of their relationship.
1. `research/data/YYYY-MM-DD-congressperson_relatives_raw.xz` also contains data on the relatives of congresspeople, but is only created if `research/src/get_family_names.py` fails to handle some names.
-1. `research/data/YYYY-MM-DD-deputies-advisors.xz` contains data from advisors of each deputy in the current term along with the deputy number and deputy name.
+1. `research/data/YYYY-MM-DD-congresspeople-advisors.xz` contains data from advisors of each congressperson in the current term, along with the congressperson number and name.
1. `research/data/YYYY-MM-DD-sex-place-distances` contains data from the closest sex related place (cat houses, night clubs, massage parlours etc.) to each company (including distance in meters) — this dataset is just a sample (check [this notebook](research/develop/2017-04-21-cuducos-explore-sex-places-dataset.ipynb) for details).
1. `research/data/YYYY-MM-DD-tse-candidates.xz` contains information about politicians candidacy over the last years. Can be used to extract a list of all politicians in Brazil.
1. `research/data/YYYY-MM-DD-congressperson-details.xz` contains the birth date, gender and civil name of congresspeople.
@@ -10,9 +10,10 @@

 CAMARA_URL = (
     'http://www2.camara.leg.br/transparencia/recursos-humanos/'
-    'quadro-remuneratorio/consulta-secretarios-parlamentares/'
+    'servidores/lotacao/consulta-secretarios-parlamentares/'
     'layouts_transpar_quadroremuner_consultaSecretariosParlamentares'
 )
 
 USERAGENT = (
     'Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
@@ -23,40 +24,39 @@
     'name',
     'act_issue_at',
     'act_issued_by',
-    'deputy_name',
-    'deputy_number'
+    'congressperson_name',
+    'congressperson_number'
 )
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 DATA_PATH = os.path.join(BASE_DIR, 'data')
 DATE = datetime.date.today().strftime('%Y-%m-%d')
-FILE_BASE_NAME = '{}-deputies-advisors.xz'.format(DATE)
+FILE_BASE_NAME = '{}-congresspeople-advisors.xz'.format(DATE)
 OUTPUT = os.path.join(DATA_PATH, FILE_BASE_NAME)
 
 
 def run():
-    print("Fetching deputies data…")
-    deputies_data = fetch_deputies_data()
+    print("Fetching congresspeople data…")
+    congresspeople_data = fetch_congresspeople_data()
 
     print("Preparing requests to fetch advisors data…")
-    requests_ = (get_page(deputy) for deputy in deputies_data)
-
+    requests_ = (get_page(congressperson) for congressperson in congresspeople_data)
     for page_data in send_requests(requests_):
-        deputy_with_advisors = page_data["data"]
-        deputy = {
-            "deputy_name": deputy_with_advisors["deputy_name"],
-            "deputy_number": deputy_with_advisors["deputy_number"]
+        congressperson_with_advisors = page_data["data"]
+        congressperson = {
+            "congressperson_name": congressperson_with_advisors["congressperson_name"],
+            "congressperson_number": congressperson_with_advisors["congressperson_number"]
         }
-        advisors = tuple(deputy_with_advisors["deputy_advisors"])
-        deputy_information = organize_deputy_data(deputy, advisors)
-        write_to_csv(deputy_information, OUTPUT)
+        advisors = tuple(congressperson_with_advisors["congressperson_advisors"])
+        congressperson_information = organize_congressperson_data(congressperson, advisors)
+        write_to_csv(congressperson_information, OUTPUT)
 
     print("\nDone! The file can be found at {}".format(OUTPUT))
 
 
 def send_requests(reqs):
     """
     Send all the requests in :reqs: and reads the response data to extract the
-    deputies data. It will check if a deputy has more than one page of
+    congresspeople data. It will check if a congressperson has more than one page of
     advisors and send new requests if True
     """
     buffer = list()
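The heart of the fetching step is parsing the `<option>` elements of the page's `<select id="lotacao">`. That idea can be illustrated without network access or lxml; this sketch uses the standard library's `xml.etree.ElementTree` on a hard-coded fragment (the markup is a guess at the page's shape, not a capture of it):

```python
import xml.etree.ElementTree as ET

HTML_FRAGMENT = """
<select id="lotacao">
  <option value="0">Selecione...</option>
  <option value="1234">FULANO DE TAL</option>
  <option value="5678">BELTRANA DA SILVA</option>
</select>
"""


def parse_options(fragment):
    """Yield one dict per <option>, mirroring get_congresspeople_list."""
    tree = ET.fromstring(fragment)
    for option in tree.iter('option'):
        yield {
            'congressperson_name': option.text,
            'congressperson_number': option.get('value'),
        }


# Skip the first entry, as the script does with islice (it is the placeholder).
congresspeople = list(parse_options(HTML_FRAGMENT))[1:]
```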
@@ -83,42 +83,42 @@ def send_requests(reqs):
        print('.', end="", flush=True)
 
 
-def fetch_deputies_data():
+def fetch_congresspeople_data():
     """
-    Returns a list with all deputies names and its numbers after parsing the
+    Returns a list with all congresspeople names and their numbers after parsing the
     `<select>` element in `CAMARA_URL`
     """
     page = requests.get(CAMARA_URL)
     tree = html.fromstring(page.content)
 
     select = tree.xpath('//select[@id="lotacao"]/option')
-    deputies_data = get_deputies_list(select)
+    congresspeople_data = get_congresspeople_list(select)
 
-    return islice(deputies_data, 1, None)  # skip first as it is "Selecione…"
+    return islice(congresspeople_data, 1, None)  # skip first as it is "Selecione…"
 
 
-def get_deputies_list(select):
+def get_congresspeople_list(select):
     """ Parses the `<select>` element in `CAMARA_URL` """
     for option in select:
         yield dict(
-            deputy_name=option.xpath("./text()")[0],
-            deputy_number=option.xpath('./@value')[0]
+            congressperson_name=option.xpath("./text()")[0],
+            congressperson_number=option.xpath('./@value')[0]
         )
 
 
-def get_page(deputy, page=1):
+def get_page(congressperson, page=1):
     """
     Returns a POST AsyncRequest object from grequests ready to be sent to
-    `CAMARA_URL` with `lotacao` field filled with `deputy_number`. Some
-    deputies can have more than 20 advisors, so some pages will have
+    `CAMARA_URL` with the `lotacao` field filled with `congressperson_number`. Some
+    congresspeople can have more than 20 advisors, so some pages will have
     pagination. In this case it's necessary to inform the specific page you
     want to create a request to, otherwise a request to the first page will be
     created.
-    :deputy: (dict) A Dict with fields `deputy_name` and `deputy_number`
+    :congressperson: (dict) A dict with fields `congressperson_name` and `congressperson_number`
     :page: (int) Defaults to 1. The page number
     """
     data = {
-        "lotacao": deputy["deputy_number"],
+        "lotacao": congressperson["congressperson_number"],
         "b_start:int": (page - 1) * 20  # page 1 = 0, page 2 = 20, page 3 = 40
     }
     return grequests.post(CAMARA_URL, data=data)
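The `b_start:int` arithmetic in `get_page` maps a 1-based page number to a 0-based record offset in steps of 20, and the number of pages for a given advisor count is the matching ceiling division. A small sketch of that math (the helper names are mine, not the script's):

```python
import math

PAGE_SIZE = 20  # the Chamber's listing shows up to 20 advisors per page


def page_offset(page):
    """1-based page number -> b_start:int offset (page 1 = 0, page 2 = 20...)."""
    return (page - 1) * PAGE_SIZE


def pages_needed(total_advisors):
    """How many pages a listing with `total_advisors` rows spans (minimum 1)."""
    return max(1, math.ceil(total_advisors / PAGE_SIZE))
```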
@@ -128,8 +128,8 @@ def extract_data_from_page(page):
     """
     Extracts all relevant data from a page and returns it as Dict. Each
     information is inside a key in the dict as following:
-    - Deputy name, number and advisors inside the key `data` as `deputy_name`,
-    `deputy_number` and `deputy_advisors` respectively.
+    - Congressperson name, number and advisors inside the key `data` as
+    `congressperson_name`, `congressperson_number` and `congressperson_advisors` respectively.
     - Number of pages of advisors; as `number_of_pages`
     - The current page number as `current_page`
     - If it has more pages of advisors as `has_next_page`
@@ -138,25 +138,25 @@
     number_of_pages = extract_number_of_pages(html_tree)
     current_page = extract_current_page(html_tree)
 
-    tbody = html_tree.xpath('//tbody[@class="coresAlternadas"]/tr')
-    deputy_advisors = tuple(extract_adivisors(tbody))
+    tbody = html_tree.xpath('//table[@class="tabela-padrao-bootstrap"]/tbody/tr')
+    congressperson_advisors = tuple(extract_adivisors(tbody))
 
     select = html_tree.xpath('//select[@id="lotacao"]/option[@selected]')[0]
-    deputy_data = {
-        "deputy_name": select.xpath('./text()')[0],
-        "deputy_number": select.xpath("./@value")[0],
-        "deputy_advisors": deputy_advisors
+    congressperson_data = {
+        "congressperson_name": select.xpath('./text()')[0],
+        "congressperson_number": select.xpath("./@value")[0],
+        "congressperson_advisors": congressperson_advisors
     }
 
     # Some "Data de Publicação do Ato" are empty in `CAMARA_URL` and xpath is
     # not adding an 'empty string' inside the returned array, so we are adding
     # it manually below
-    for advisor_info in deputy_advisors:
+    for advisor_info in congressperson_advisors:
         if len(advisor_info) == 3:
             advisor_info.append("Empty")
 
     return {
-        'data': deputy_data,
+        'data': congressperson_data,
         'number_of_pages': number_of_pages,
         'current_page': current_page,
         'has_next_page': False if current_page == number_of_pages else True
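Two details of `extract_data_from_page` are worth isolating: advisor rows missing the publication date are padded to four fields, and `has_next_page` boils down to comparing the current page against the page count. A self-contained sketch of both (the sample rows are invented, and the helper names are mine):

```python
def pad_advisor_rows(rows, width=4, filler="Empty"):
    """Append `filler` to any row shorter than `width`, mutating it in place."""
    for row in rows:
        while len(row) < width:
            row.append(filler)
    return rows


def has_next_page(current_page, number_of_pages):
    """True while there are further pages of advisors to request."""
    return current_page != number_of_pages


rows = [["JOHN DOE", "123", "Brasilia"],              # missing the act date
        ["JANE ROE", "456", "Brasilia", "01/02/2017"]]
pad_advisor_rows(rows)
```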
@@ -187,14 +187,14 @@ def extract_number_of_pages(tree):
     return 1 if result == 0 else int(result)
 
 
-def organize_deputy_data(deputy, advisors):
+def organize_congressperson_data(congressperson, advisors):
     """
-    Organizes all the deputies information in a list. Use this function to
+    Organizes all the congresspeople information in a list. Use this function to
     prepare data to be written to CSV format.
-    :deputy: (dict) A dict with keys `deputy_name` and `deputy_number`
+    :congressperson: (dict) A dict with keys `congressperson_name` and `congressperson_number`
     :advisors: (tuple) lists with advisors data.
     """
-    name, number = deputy["deputy_name"], deputy["deputy_number"]
+    name, number = congressperson["congressperson_name"], congressperson["congressperson_number"]
     if not advisors:
         values = ('', '', '', '', name, number)
         yield dict(zip(FIELDNAMES, values))
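The row-building idiom here is `dict(zip(FIELDNAMES, values))`, with empty strings standing in when a congressperson has no advisors on record. A sketch of the same pattern; note the first field of `FIELDNAMES` is cut off in the hunk above, so `'point'` is my guess based on the "name and point number" description, and `organize` is a simplified stand-in, not the script's function:

```python
FIELDNAMES = ('point', 'name', 'act_issue_at', 'act_issued_by',
              'congressperson_name', 'congressperson_number')


def organize(congressperson, advisors):
    """Yield one CSV-ready dict per advisor (or a blank row if none)."""
    name = congressperson['congressperson_name']
    number = congressperson['congressperson_number']
    if not advisors:
        yield dict(zip(FIELDNAMES, ('', '', '', '', name, number)))
    for advisor in advisors:
        yield dict(zip(FIELDNAMES, tuple(advisor) + (name, number)))


rows = list(organize(
    {'congressperson_name': 'FULANO', 'congressperson_number': '1234'},
    [['1', 'JOHN DOE', '01/02/2017', 'Brasilia']],
))
```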
@@ -214,7 +214,7 @@ def extract_adivisors(tbody):
 def write_to_csv(data, output):
     """
     Writes `data` to `output`
-    :data: (list) list with organized deputy information ready to be written
+    :data: (list) list with organized congressperson information ready to be written
     :output: (string) the full path to a file where :data: should be written
     """
     with lzma.open(output, "at") as fh:
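`write_to_csv` appends to an xz-compressed file via `lzma.open(output, "at")`. The round trip can be sketched with the standard library alone; the write-header-only-once logic is my simplification, the real script's behaviour may differ:

```python
import csv
import lzma
import os
import tempfile

FIELDNAMES = ('name', 'congressperson_name')  # trimmed for the example


def write_rows(rows, output):
    """Append dict rows to an xz-compressed CSV, writing a header if new."""
    new_file = not os.path.exists(output)
    with lzma.open(output, 'at', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=FIELDNAMES)
        if new_file:
            writer.writeheader()
        writer.writerows(rows)


path = os.path.join(tempfile.mkdtemp(), 'advisors.xz')
write_rows([{'name': 'JOHN DOE', 'congressperson_name': 'FULANO'}], path)
write_rows([{'name': 'JANE ROE', 'congressperson_name': 'FULANO'}], path)

# lzma transparently reads the concatenated streams created by appending.
with lzma.open(path, 'rt', newline='') as fh:
    rows_back = list(csv.DictReader(fh))
```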