Python web scraping (stockfetcher.com pull)

The Python script below fetches the full thread listings from five StockFetcher forum boards: General Discussion, Filter Exchange, Public Filters, Stock Picks, and Indicators. Each board is written to its own CSV file.

It depends on three third-party packages, all available from PyPI: requests, lxml, and humanize (pip install requests lxml humanize).

Destination dir: /tmp
File type: CSV

 

#!/usr/bin/env python3
from lxml import html
import requests
import csv
import humanize


def printable(input_str):
    # Replace any non-ASCII character with a space so the CSV stays plain ASCII.
    return ''.join(c if ord(c) < 128 else ' ' for c in input_str)


def process_url_content(input_url, output_file):
    print("\nFetching contents from " + input_url)
    page = requests.get(input_url)
    print("\nDownloaded " + humanize.naturalsize(len(page.content), gnu=True) + ".")
    tree = html.fromstring(page.content)

    # Each forum listing is a single table; pull the columns separately.
    row = '//table[@class="table table-condensed"]/tbody/tr/'
    subject = tree.xpath(row + 'td[1]/a/text()')
    rel_link = tree.xpath(row + 'td[1]/a/@href')
    user_name = tree.xpath(row + 'td[2]/text()')
    replies = tree.xpath(row + 'td[3]/text()')
    last_date = tree.xpath(row + 'td[4]/text()')

    if not subject:
        return 0  # empty page: tells the caller to stop paginating

    records = 0
    with open(output_file, 'a', newline='', encoding='utf-8') as output:
        writer = csv.writer(output)
        if output.tell() == 0:
            # Fresh file: write the header row once, not once per page.
            writer.writerow(('Subject', 'URL', 'Clickable URL', 'Author', 'Replies', 'Last'))
        # zip() stops at the shortest column, so ragged rows cannot raise IndexError.
        for subj, link, user, reply, last in zip(subject, rel_link, user_name, replies, last_date):
            http_url = 'http://www.stockfetcher.com' + link
            writer.writerow((subj, http_url, '=HYPERLINK("' + http_url + '")',
                             user, reply, printable(last)))
            records += 1

    print("\nWrote " + str(records) + " records in " + output_file)
    return 1
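
# Note: the '=HYPERLINK(...)' column is a spreadsheet formula; Excel and
# LibreOffice Calc render it as a clickable link when the CSV is opened.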


url_list = ['http://www.stockfetcher.com/forums2/General-Discussion',
            'http://www.stockfetcher.com/forums2/Filter-Exchange',
            'http://www.stockfetcher.com/forums2/Stock-Picks',
            'http://www.stockfetcher.com/forums2/Indicators',
            'http://www.stockfetcher.com/forums2/Public-Filters']


for link in url_list:
    output_filename = '/tmp/' + link.rsplit('/', 1)[-1] + '.csv'

    # Listings are paginated in steps of 50 threads (/0, /50, /100, ...);
    # keep fetching until a page comes back with no rows. Files are opened
    # in append mode, so delete old CSVs before re-running.
    func_ret = 1
    running_val = 0
    jump_val = 50
    while func_ret == 1:
        func_ret = process_url_content(link + '/' + str(running_val), output_filename)
        running_val += jump_val
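
As a quick sanity check after a run, each CSV can be read back with the standard csv module. A minimal sketch, assuming the Filter Exchange pull completed and /tmp/Filter-Exchange.csv exists:

import csv

# Print the first few threads from one of the generated files.
with open('/tmp/Filter-Exchange.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        if i >= 5:
            break
        print(row['Subject'], '|', row['Author'], '|', row['Replies'])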

 
