Useful script I made some months ago

I am from Venezuela and some months ago we had presidential elections.

The opposition, which is more or less my side (I am tired of this government), claimed fraud. So, to check that for myself, I made a crawler for the results page (www.cne.gob.ve/resultado_presidencial_2013/r/1/reg_000000.html) to gather the numbers and analyze them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re
import threading

paths_re = re.compile('<\s*li\s+class="region-nav-item"><a\s+id="region_ref"\s+href="(.*)"\s*>(.*)<\s*/a\s*>\s*<\s*/li\s*>', re.IGNORECASE)
values_re = re.compile('<tr\s+class="tbsubtotalrow"\s*>\s*<td\s+class="lightRowContent"\s+align="center"\s*>\s*<img\s+src=".+"\s+alt=""\s+width="50"\s+/>\s*<\s*/td\s*>\s*<td\s+class="lightRowContent">\s*<table\s+width="100%">\s*<\s*tr\s*>\s*<\s*td\s+align="left"\s*>\s*<\s*a\s+href="javascript:showCandidateInfo\(\d\);"\s*>(.*)<\s*/a\s*>\s*<\s*/td\s*>\s*<\s*/tr\s*>(\s*<\s*tr\s*>\s*<\s*td\s*>\s*<\s*font\s+color="\#990000"\s*>\s*Adjudicado\s*<\s*/font\s*>\s*<\s*/td\s*>\s*<\s*/tr\s*>\s*|\s*)<\s*/table\s*>\s*<\s*/td\s*>\s*<\s*td\s+class="lightRowContent"\s+align="right"\s*>\s*<\s*span\s*>(.*)<\s*/span\s*>\s*<\s*/td>\s*<\s*td\s+class="lightRowContent"\s+align="right"\s*>\s*<\s*span\s*>(.*)<\s*/span\s*>\s*<\s*/td>\s*', re.IGNORECASE)
filter_re = re.compile('%|\.|\s')
main_url = 'http://www.cne.gob.ve/resultado_presidencial_2013/r/1/'
depth_map = ['All', 'State', 'Municipality', 'County', 'Center', 'Table']
separator = '\t'
outfiles = []
outfilename = 'electionsdata'
qttythread = 6
rthreads = []
generalfile = open(outfilename, 'w')
firstorder = dict()

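# Fetch one results page, write its candidate totals to file_to_write, then follow
# every sub-region link it contains (the very first page also spawns the worker threads).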
def crawl_page(url, region, depth, file_to_write):
    page = requests.get(url)
    paths = paths_re.findall(page.text)
    values = values_re.findall(page.text)

    if region == separator:
        # First (national) page: remember each candidate's column position so
        # every later row can be written in the same order as the header.
        names = map(lambda v: v[0], values)
        counter = 0
        for name in names:
            firstorder[name] = counter
            counter += 1
        print_headers(file_to_write, names)

    print_info(file_to_write, region, values, depth)

    if region == separator:
        # Top-level page: split the region links across qttythread worker
        # threads, each writing to its own output file.
        chunked_lists = chunk_list(paths, (len(paths) + (qttythread - len(paths) % qttythread)) / qttythread)
        threadn = 0
        for chunked_list in chunked_lists:
            t = threading.Thread(target=recursive_calls, args=(chunked_list, region, depth, outfiles[threadn], ))
            t.start()
            rthreads.append(t)
            threadn += 1
    else:
        recursive_calls(paths, region, depth, file_to_write)

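# Follow a list of (href, region name) pairs, extending the region path and
# crawling each linked page one level deeper.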
def recursive_calls(paths, region, depth, file_to_write):
    savedregion = region
    for path in paths:
        pathsurl = path[0]
        pathsfor = path[1]
        region =  savedregion + separator + pathsfor
        print region
        crawl_page(main_url + pathsurl, region, depth + 1, file_to_write)

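# Split a list into consecutive slices of at most chunksize elements.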
def chunk_list(l, chunksize):
    result = []
    for i in xrange(0, len(l), chunksize):
        result.append(l[i:i+chunksize])
    return result

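# Write the header row: a 'Type' column, one column per region level, then a
# value/% column pair per candidate.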
def print_headers(file_to_write, names):
    result = 'Type'
    for region_type in depth_map:
        result += separator + region_type
    for name in names:
        result += separator + name + ' value\t' + name + ' %'
    file_to_write.write(result + '\n')

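# Write one data row: region type and path, padding for the missing levels, then
# the cleaned votes and percentage for each candidate.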
def print_info(file_to_write, region, values, depth):
    order_values(values)
    result = depth_map[depth] + region
    for i in xrange(len(depth_map) - depth - 1):
        result += separator
    for value in values:
        result += separator + clean_number_str(value[2]) + separator + clean_number_str(value[3])
    file_to_write.write(result + '\n')

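# Reorder the candidate tuples in place so they match the column order recorded in firstorder.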
def order_values(values):
    position = 0
    while position < len(values):
        needed_position = firstorder[values[position][0]]
        if position == needed_position:
            position += 1
        else:
            swap(values, position, needed_position)

def swap(l, i, j):
    aux = l[i]
    l[i] = l[j]
    l[j] = aux

def clean_number_str(number_str):
    # Strip '%', the thousands dots and whitespace, then turn the decimal comma into a dot.
    return filter_re.sub('', number_str).replace(',', '.')

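# Open one output file per worker thread, start crawling from the national summary
# page, then wait for the workers and close all files.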
if __name__ == "__main__":
    for i in xrange(qttythread):
        outfile = open(outfilename + str(i), 'w')
        outfiles.append(outfile)

    crawl_page(main_url + 'reg_000000.html', separator, 0, generalfile)

    for rthread in rthreads:
        rthread.join()

    for outfile in outfiles:
        outfile.close()
    generalfile.close()

In the end, I didn't run it. I think this final version is pretty complete and could be used for a variety of other things, so I am sharing the code :). It uses a number of threads and crawls the pages recursively, writing to different files. The regular expressions are pretty ugly, but I didn't want to use an HTML parser or anything like that.
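
If you would rather avoid the regexes, roughly the same extraction could be done with an HTML parser. Here is a minimal sketch using BeautifulSoup; the class and id names ("region-nav-item", "region_ref", "tbsubtotalrow", "lightRowContent") and the cell layout are just what the patterns above assume, and I have not verified it against the live page.

import requests
from bs4 import BeautifulSoup

def parse_page(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    # Links to sub-regions (what paths_re captures): (href, region name) pairs.
    paths = []
    for li in soup.find_all('li', class_='region-nav-item'):
        a = li.find('a', id='region_ref')
        if a is not None:
            paths.append((a.get('href'), a.get_text(strip=True)))

    # One tuple per candidate (what values_re captures): (name, votes, percentage).
    values = []
    for row in soup.find_all('tr', class_='tbsubtotalrow'):
        cells = row.find_all('td', class_='lightRowContent')
        if len(cells) >= 4:
            name = cells[1].find('a').get_text(strip=True)
            votes = cells[2].get_text(strip=True)
            percent = cells[3].get_text(strip=True)
            values.append((name, votes, percent))
    return paths, values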