I am from Venezuela, and a few months ago we had presidential elections.
The opposition, which is roughly my side (I am tired of this government), claimed fraud. So, to check that for myself, I wrote a crawler for the results page (www.cne.gob.ve/resultado_presidencial_2013/r/1/reg_000000.html) to gather the data and analyze it.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re
import threading

# Matches the region navigation links: (href, region name).
paths_re = re.compile('<\s*li\s+class="region-nav-item"><a\s+id="region_ref"\s+href="(.*)"\s*>(.*)<\s*/a\s*>\s*<\s*/li\s*>', re.IGNORECASE)
# Matches one result row: (candidate name, optional "Adjudicado" marker, votes, percentage).
values_re = re.compile('<tr\s+class="tbsubtotalrow"\s*>\s*<td\s+class="lightRowContent"\s+align="center"\s*>\s*<img\s+src=".+"\s+alt=""\s+width="50"\s+/>\s* \s*<td\s+class="lightRowContent">\s*<table\s+width="100%">\s*<\s*tr\s*>\s*<\s*td\s+align="left"\s*>\s*<\s*a\s+href="javascript:showCandidateInfo\(\d\);"\s*>(.*)<\s*/a\s*>\s*<\s*/td\s*>\s*<\s*/tr\s*>(\s*<\s*tr\s*>\s*<\s*td\s*>\s*<\s*font\s+color="\#990000"\s*>\s*Adjudicado\s*<\s*/font\s*>\s*<\s*/td\s*>\s*<\s*/tr\s*>\s*|\s*)<\s*/table\s*>\s*<\s*/td\s*>\s*<\s*td\s+class="lightRowContent"\s+align="right"\s*>\s*<\s*span\s*>(.*)<\s*/span\s*>\s*<\s*/td>\s*<\s*td\s+class="lightRowContent"\s+align="right"\s*>\s*<\s*span\s*>(.*)<\s*/span\s*>\s*<\s*/td>\s*', re.IGNORECASE)
# Strips percent signs, thousands separators, and whitespace from the numbers.
filter_re = re.compile('%|\.|\s')

main_url = 'http://www.cne.gob.ve/resultado_presidencial_2013/r/1/'
depth_map = ['All', 'State', 'Municipality', 'County', 'Center', 'Table']
separator = '\t'
outfiles = []
outfilename = 'electionsdata'
qttythread = 6
rthreads = []
generalfile = open(outfilename, 'w')
firstorder = dict()  # candidate name -> column position, fixed by the first page


def crawl_page(url, region, depth, file_to_write):
    page = requests.get(url)
    paths = paths_re.findall(page.text)
    values = values_re.findall(page.text)
    if region == separator:
        # First (national) page: remember the candidate order and write the headers.
        names = map(lambda v: v[0], values)
        counter = 0
        for name in names:
            firstorder[name] = counter
            counter += 1
        print_headers(file_to_write, names)
    print_info(file_to_write, region, values, depth)
    if region == separator:
        # Split the state links among the worker threads, one output file each.
        chunked_lists = chunk_list(paths, (len(paths) + (qttythread - len(paths) % qttythread)) / qttythread)
        threadn = 0
        for chunked_list in chunked_lists:
            t = threading.Thread(target=recursive_calls, args=(chunked_list, region, depth, outfiles[threadn], ))
            t.start()
            rthreads.append(t)
            threadn += 1
    else:
        recursive_calls(paths, region, depth, file_to_write)


def recursive_calls(paths, region, depth, file_to_write):
    savedregion = region
    for path in paths:
        pathsurl = path[0]
        pathsfor = path[1]
        region = savedregion + separator + pathsfor
        print region
        crawl_page(main_url + pathsurl, region, depth + 1, file_to_write)


def chunk_list(l, chunksize):
    result = []
    for i in xrange(0, len(l), chunksize):
        result.append(l[i:i + chunksize])
    return result


def print_headers(file_to_write, names):
    result = 'Type'
    for region_type in depth_map:
        result += separator + region_type
    for name in names:
        result += separator + name + ' value\t' + name + ' %'
    file_to_write.write(result + '\n')


def print_info(file_to_write, region, values, depth):
    order_values(values)
    result = depth_map[depth] + region
    # Pad the missing region columns so every row has the same number of fields.
    for i in xrange(len(depth_map) - depth - 1):
        result += separator
    for value in values:
        result += separator + clean_number_str(value[2]) + separator + clean_number_str(value[3])
    file_to_write.write(result + '\n')


def order_values(values):
    # Reorder the rows in place so candidates always appear in the order
    # recorded from the first page.
    position = 0
    while position < len(values):
        needed_position = firstorder[values[position][0]]
        if position == needed_position:
            position += 1
        else:
            swap(values, position, needed_position)


def swap(l, i, j):
    aux = l[i]
    l[i] = l[j]
    l[j] = aux


def clean_number_str(number_str):
    return filter_re.sub('', number_str).replace(',', '.')


if __name__ == "__main__":
    for i in xrange(qttythread):
        outfile = open(outfilename + str(i), 'w')
        outfiles.append(outfile)
    crawl_page(main_url + 'reg_000000.html', separator, 0, generalfile)
    for rthread in rthreads:
        rthread.join()
    for outfile in outfiles:
        outfile.close()
    generalfile.close()
In the end, I didn't run it. I think this final version is fairly complete and could be reused for a variety of other things, so I am sharing the code :). It uses a number of threads and crawls the pages recursively, with each thread writing to its own file. The regular expressions are pretty ugly, but I didn't want to use an HTML parser or anything like that.
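For comparison, here is a minimal sketch of how the same two extractions could look with an HTML parser such as BeautifulSoup, assuming the page markup is exactly what the regexes above target. The function name parse_page and the simplified three-element value tuple (without the "Adjudicado" group) are illustrative choices, not part of the original script.

# Hypothetical alternative to paths_re / values_re using BeautifulSoup (bs4).
import requests
from bs4 import BeautifulSoup


def parse_page(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    # Region navigation links: (href, region name), like paths_re.
    paths = [(a['href'], a.get_text(strip=True))
             for a in soup.select('li.region-nav-item a#region_ref')]
    # Result rows: (candidate name, votes, percentage), like values_re
    # minus the "Adjudicado" group.
    values = []
    for row in soup.find_all('tr', class_='tbsubtotalrow'):
        name = row.find('a').get_text(strip=True)
        spans = [s.get_text(strip=True) for s in row.find_all('span')]
        if len(spans) >= 2:
            values.append((name, spans[0], spans[1]))
    return paths, values

A parser-based version like this tends to survive small changes in whitespace or attribute order in the markup, at the cost of an extra dependency.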