#!/usr/bin/env python
import argparse
from bs4 import BeautifulSoup
import urllib2
import re
import csv
import sys
import os.path
import tempfile
import itertools

def readCL():
    """Parse the command line.

    Returns a tuple
    (grep, url, html_file, path, relative, print_url, no_cache, phantomjs, verbose).
    path is None when -l was not given, otherwise a list with one entry per
    -l value, each entry being that value split on commas into step strings.
    no_cache is always True: the --no_cache option is commented out below and
    the attribute is forced on after parsing.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-g", "--grep", help="item to grep from the website")
    cli.add_argument(
        "-l", "--location", nargs="*",
        help="location within the page to look. given as a path through the DOM tree. The path is a csv list of steps where '-' is a wildcard. eg: 1,1,0,13,9,1,5,3,1,-,0")
    cli.add_argument("-u", "--url", help="url to look in")
    cli.add_argument("-f", "--html_file")
    cli.add_argument(
        "-r", "--relative",
        help="grep the -g arg position relative to -r arg instead of absolute position on the page. Useful if the desired section of the page remains the same but other layout varies.")
    cli.add_argument("--print_url", action="store_true", help="print link urls instead of link text")
    # cli.add_argument("--no_cache",help="By default webgrep caches the last webpage you downloaded for faster rerunning. Use no_cache to override.")
    cli.add_argument(
        "--phantomjs", action="store_true",
        help="use phantomjs to parse the website for DOM elements that are loaded using javascript")
    cli.add_argument("-v", "--verbose", action="store_true")
    opts = cli.parse_args()

    # Each -l value is one comma separated list of steps.
    path = None
    if opts.location:
        path = [spec.strip().split(",") for spec in opts.location]

    # Caching is disabled unconditionally while the option above is commented out.
    opts.no_cache = True
    return (opts.grep, opts.url, opts.html_file, path, opts.relative,
            opts.print_url, opts.no_cache, opts.phantomjs, opts.verbose)


def fix_broken_pipe():
    """Restore the default SIGPIPE handler.

    This avoids the 'Broken pipe' error when the script's output is piped
    into a command such as head.
    """
    import signal as _signal
    _signal.signal(_signal.SIGPIPE, _signal.SIG_DFL)

    
def path_to(node):
    """Return the path from the top of the tree down to node.

    The path is a list of integers: each entry is the position of the next
    ancestor (and finally of node itself) inside its parent's contents.
    The topmost ancestor contributes no entry, so the path of the root is [].

    Raises ValueError if an element is not found among its parent's contents.
    """
    lineage = list(node.parents)[::-1] + [node]  # starting from top of the tree
    path = []
    for child in lineage[1:]:
        # Locate the child by identity. list.index() compares with ==, and
        # BeautifulSoup elements compare equal when their markup is the same,
        # so identical siblings (repeated cells, whitespace strings, ...)
        # would all be reported at the position of the first one.
        for position, candidate in enumerate(child.parent.contents):
            if candidate is child:
                path.append(position)
                break
        else:
            raise ValueError("element is not in its parent's contents")
    return path

def relative_path(path1, path2):
    """
    Steps needed to get from the location path1 to the location path2.

    Each -1 means "go up one level"; the remaining entries are the steps
    down from the deepest location shared by both paths.
    """
    #  from 1,2,3   to 1,2,3,7,0,2 =    7,0,2
    #  from 1,2,3,4 to 1,2,3,7,0,2 = -1,7,0,2
    common = path1
    climbs = 0
    # Shorten path1 from the end until what is left is a prefix of path2.
    while path2[:len(common)] != common:
        common = common[:-1]
        climbs += 1
    remainder = path2[len(common):]
    if climbs:
        return [-1] * climbs + remainder
    return remainder

    
def follow_path(node,path,nested=True):
    """Walk path starting at node and return everything reached as a list.

    This is the list-returning front end of the _follow_path generator.
    """
    reached = _follow_path(node, path, nested)
    return list(reached)


def _follow_path(node, path, nested=True):
    #TODO(jtrigg) why is nested here if it's always set to True in the code? ok to get rid of it?
    if not path:
        yield node
    elif path[0] == "-":
        if not hasattr(node,"contents"):
            return
        for n1 in node.contents:
            #don't encase the last '-' in path because those are single items
            if nested and "-" in path[1:]:
                l = [n2 for n2 in follow_path(n1,path[1:],nested)]
                #don't yield l if we didn't find anything
                if l:
                    yield l
                else:
                    continue
            else:
                for n2 in follow_path(n1,path[1:],nested):
                    yield n2
    elif path[0] == "-1":
        for n in follow_path(node.parent, path[1:],nested):
            yield n
    else:
        step = int(path[0])
        if not hasattr(node,"contents"): return
        if len(node.contents) <= step: return
        n1 = node.contents[step]
        for n2 in follow_path(n1, path[1:],nested=True):
            yield n2


def trim_with_ellipses(string, max_len):
    """Strip line breaks from string and shorten it to at most max_len characters.

    When the string has to be shortened, its tail is replaced by "..." so
    that the result is exactly max_len long. For max_len below 3 there is
    no room for the full ellipsis, so the ellipsis itself is cut down
    (the result is never longer than max_len).
    """
    flat = string.replace("\n", "").replace("\r", "")
    if len(flat) <= max_len:
        return flat
    if max_len <= 3:
        # A negative slice bound would count from the end of the string and
        # return something longer than max_len.
        return "..."[:max(max_len, 0)]
    return flat[:max_len - 3] + "..."

def web_grep(item, url):
    """Download url and return the list of rows _web_grep produces for item."""
    page = url_to_soup(url)
    rows = _web_grep(item, page)
    return list(rows)

def _find_nodes(item, soup):
    """Return the nodes of soup matching the regular expression item.

    Results of two find_all searches are concatenated (text matches first)
    and each result is passed through find_proper_node.
    """
    pattern = re.compile(item)
    # NOTE(review): the html= keyword is presumably treated by BeautifulSoup
    # as a filter on an attribute named "html" -- confirm this is intended.
    hits = soup.find_all(text=pattern) + soup.find_all(html=pattern)
    return [find_proper_node(hit) for hit in hits]



def find_proper_node(node):
    """Climb from a node returned by soup.find_all to the node we actually wanted.

    soup.find_all returns, for example, the text inside a link as its own
    node rather than the link. As long as the parent is an "a" or "td"
    element holding exactly one child, move up to that parent.
    """
    current = node
    while current.parent.name in ("a", "td") and len(current.parent.contents) == 1:
        current = current.parent
    return current

def node_to_str(node, print_url=False, max_length=50):
    """Render node as a UTF-8 encoded string without newlines.

    node: an element of the parsed page, or a plain string.
    print_url: when true and the node has an href attribute, render the
        href instead of the node's text.
    max_length: maximum number of characters to keep (longer text is cut
        with trim_with_ellipses); a false value disables trimming.

    A "td" element with a single child is rendered as that child.
    """
    if getattr(node, "name", "") in ["td"] and len(node.contents) == 1:
        return node_to_str(node.contents[0], print_url=print_url, max_length=max_length)
    if print_url and "href" in getattr(node, "attrs", []):
        text = unicode(node["href"])
    elif hasattr(node, "text"):
        text = unicode(node.text)
    else:
        text = unicode(node)
    text = text.replace(u"\n", u"")
    if max_length:
        # Trim while the text is still unicode: cutting the encoded bytes
        # could split a multi-byte character and emit invalid UTF-8.
        text = trim_with_ellipses(text, max_length)
    return text.encode("utf-8")
    
def _web_grep(item, soup, relative_node=None):
    """Generate one output row per node of soup matching item.

    A row is [match text, location, nearby...]: the location is the node's
    path as comma separated steps prefixed with a space (relative to
    relative_node when one is given), followed by the text of nearby nodes
    padded with "" up to max_levels * max_sibs_per_level columns.
    """
    max_levels = 3
    max_sibs_per_level = 3
    nearby_columns = max_levels * max_sibs_per_level
    for match in _find_nodes(item, soup):
        absolute = path_to(match)
        neighbours = list(_get_all_nearby(soup, absolute, max_levels, max_sibs_per_level))
        # Pad so every row has the same number of nearby columns.
        neighbours.extend([""] * (nearby_columns - len(neighbours)))
        if relative_node:
            shown = relative_path(path_to(relative_node), absolute)
        else:
            shown = absolute
        location = " " + ",".join(str(step) for step in shown)
        yield [node_to_str(match), location] + [node_to_str(near) for near in neighbours]

def _get_all_nearby(soup, path, max_levels, max_sibs_per_level):
    """Generate the nodes near path as one flat stream.

    Takes the first max_levels sibling lists from _get_siblings_by_depth
    (each at most max_sibs_per_level long) and yields their members in order.
    """
    per_depth = _get_siblings_by_depth(soup, path, max_sibs_per_level)
    kept_levels = list(itertools.islice(per_depth, max_levels))
    for neighbour in itertools.chain.from_iterable(kept_levels):
        yield neighbour
        
def _get_siblings_by_depth(soup, path, max_sibs_per_depth = 3):
    """Generate lists of nearest siblings, deepest level of path first.

    For every depth of path, from the last step back to the first, yields
    up to max_sibs_per_depth results of _get_nearest_siblings; depths with
    no results are skipped.
    """
    depth = len(path) - 1
    while depth >= 0:
        candidates = _get_nearest_siblings(soup, path, depth)
        closest = list(itertools.islice(candidates, max_sibs_per_depth))
        if closest:
            yield closest
        depth -= 1

def _get_nearest_siblings(soup, path, depth):
    """Generate the nodes found by varying the step at position depth of path.

    The step is replaced by its neighbouring indexes, nearest first and, at
    equal distance, the higher index before the lower one; the rest of the
    path is kept. Indexes outside the parent's contents and paths that lead
    nowhere are skipped.
    eg for path [1,3,2] at depth 2 the paths tried are
    [1,3,3], [1,3,1], [1,3,4], [1,3,0], [1,3,5], ...
    """
    head, tail = path[:depth], path[depth+1:]
    parent_node = follow_path(soup, head)[0]
    siblings_cnt = len(parent_node.contents)
    sibling_index = path[depth]
    for distance in range(1, siblings_cnt):
        for candidate in (sibling_index + distance, sibling_index - distance):
            if not (0 <= candidate < siblings_cnt):
                continue
            found = follow_path(soup, head + [candidate] + tail)
            if found:
                yield found[0]

def get_href(elt):
    """Return the href of elt, or else of its parent, or None if neither has one."""
    #TODO(jtrigg) 20160113 is this still needed?
    if "href" in getattr(elt, "attrs", []):
        return elt["href"]
    parent = elt.parent
    if "href" in getattr(parent, "attrs", []):
        return parent["href"]
    return None

def write_path(path, url, print_url):
    """Download url and return the rows _write_path produces for path.

    path: list of steps as accepted by follow_path.
    print_url: passed through to _write_path.
    """
    # Keep the parsed page: previously the result was discarded and the
    # next line referred to a name that was never assigned in this function.
    soup = url_to_soup(url)
    return list(_write_path(path, soup, print_url))

def _write_path(path, soup, print_url):
    """Generate one output row per result of following path through soup.

    A result that is a list becomes a row with one column per member; a
    single node becomes a one-column row. Cells are rendered by node_to_str.
    """
    for found in follow_path(soup, path):
        members = found if isinstance(found, list) else [found]
        yield [node_to_str(member, print_url=print_url) for member in members]

def get_cached_soup(no_cache, print_url, get_soup_fn):
    """Return the soup for a page, optionally reusing the copy cached on disk.

    no_cache: when true, always call get_soup_fn and never touch the cache.
    print_url: the URL of the page. Despite its name this parameter carries
        the page URL (get_soup passes the url here); the name is kept so
        existing callers keep working.
    get_soup_fn: callable that takes the URL and returns the parsed page.

    With caching enabled, the cache file holds the last [url, soup] pair;
    it is reused when its url equals the requested one and is otherwise
    replaced by a fresh download.
    """
    url = print_url
    if no_cache:
        return get_soup_fn(url)

    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    cache_file = "/tmp/.webgrep"
    if os.path.exists(cache_file):
        # SECURITY NOTE: unpickling can execute arbitrary code and this file
        # sits at a fixed path under /tmp; only enable the cache where that
        # location is trusted.
        with open(cache_file, 'rb') as f_in:
            old_url, old_soup = pickle.load(f_in)
        if old_url == url:
            return old_soup
    soup = get_soup_fn(url)
    save_soup(url, soup, cache_file)
    return soup


def save_soup(url, soup, save_file):
    """Pickle the pair [url, soup] into save_file.

    If pickling fails with a RuntimeError, a warning is written to stderr
    and the partially written file is removed.
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    to_save = [url, soup]
    try:
        # The with block closes the file before it is read back or removed.
        with open(save_file, 'wb') as f_out:
            pickle.dump(to_save, f_out)
    except RuntimeError:
        sys.stderr.write("WARNING: couldn't save website" + "\n")
        os.remove(save_file)

def html_to_soup(html):
    """Parse html into a BeautifulSoup tree.

    The "lxml" parser is tried first; if that fails for any reason the
    "html.parser" parser is used instead.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Not a bare except: KeyboardInterrupt and SystemExit must not be
        # swallowed by the fallback.
        soup = BeautifulSoup(html, "html.parser")
    return soup
    
def url_to_soup(url):
    """Download url with a browser-like User-agent header and parse it with html_to_soup."""
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0')]
    response = opener.open(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    return html_to_soup(html)
    
def run(cmd):
    """Run cmd through the shell and return (stdout, stderr, return_code)."""
    from subprocess import PIPE, Popen
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    out, err = proc.communicate()
    return out, err, proc.returncode

def get_phantomjs_soup(url):
    """Load url in phantomjs and return the soup of the resulting page content.

    A small script that opens the page and prints page.content is written
    to a temporary file, run with the phantomjs command, and removed again.
    A warning is written to stderr when phantomjs exits with a non-zero
    status; whatever it printed is parsed regardless.
    """
    import json
    # json.dumps produces a quoted, fully escaped string literal, so quotes
    # or backslashes in the url cannot break out of the generated script.
    print_url_js_code = """var page = require('webpage').create();
page.open(""" + json.dumps(url) + """, function () {
    console.log(page.content);
    phantom.exit();
});
"""
    fd, f_path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as f_in:
            f_in.write(print_url_js_code)
        html, _, return_code = run("phantomjs " + f_path)
        if return_code != 0:
            sys.stderr.write("WARNING: phantomjs exited with status " + str(return_code) + "\n")
        return html_to_soup(html)
    finally:
        os.remove(f_path)

def get_soup(url, phantomjs, html_file, no_cache):
    """Return the parsed page from whichever input was supplied.

    url with phantomjs set: render the page through phantomjs.
    url alone: download it, going through get_cached_soup (no_cache is
        passed on).
    html_file (only when no url is given): parse that local file.

    Raises Exception when neither url nor html_file is given.
    """
    if url and phantomjs:
        soup = get_phantomjs_soup(url)
    elif url:
        soup = get_cached_soup(no_cache, url, url_to_soup)
    elif html_file:
        # Close the file as soon as its contents have been read.
        with open(html_file) as f_in:
            html = f_in.read()
        soup = html_to_soup(html)
    else:
        raise Exception("Couldn't find input url or html_file!")
    return soup

def main_grep(grep, soup, relative, verbose):
    """Write the matches of grep in soup to stdout as CSV.

    relative: optional pattern; when given, locations are printed relative
        to the first node matching it.
    verbose: when false, only the match and location columns are written
        and the nearby columns are dropped.

    Raises IndexError when relative is given but matches nothing.
    """
    relative_node = None
    if relative:
        anchors = _find_nodes(relative, soup)
        if not anchors:
            # Same exception type as the bare [0] lookup, but with a message.
            raise IndexError("ERROR: no match found for relative pattern: " + relative)
        relative_node = anchors[0]
    # _web_grep emits 9 nearby columns per row (3 levels x 3 siblings).
    csv_header = ["match","location"] + ["nearby"+str(i) for i in range(9)]
    csv_rows = [csv_header] + list(_web_grep(grep, soup, relative_node))
    if not verbose:
        csv_rows = [r[:2] for r in csv_rows] #drop neighbor columns
    csv.writer(sys.stdout, lineterminator= '\n').writerows(csv_rows)
    
def main_print_path(all_paths, soup, relative, print_url):
    """Write the nodes found at each path of all_paths to stdout as CSV.

    all_paths: list of paths, each a list of step strings.
    relative: optional pattern; when given, every path is followed starting
        from the location of the first node matching it.
    print_url: print link urls instead of link text.

    Raises Exception when a path has more than two wildcards and IndexError
    when relative is given but matches nothing.
    """
    for path in all_paths:
        if path.count("-") > 2:
            raise Exception("ERROR: more than two wildcards ('-') in step list")

    # The location of the relative anchor is the same for every path, so it
    # is looked up once rather than once per path.
    prefix = []
    if relative:
        first_match = next(_web_grep(relative, soup), None)
        if first_match is None:
            raise IndexError("ERROR: no match found for relative pattern: " + relative)
        # The location column is written with a leading space; drop it (and
        # any empty step) before turning it back into a list of steps.
        prefix = [step for step in first_match[1].strip().split(",") if step]

    csv_rows = []
    for path in all_paths:
        csv_rows += _write_path(prefix + path, soup, print_url)
    csv.writer(sys.stdout, lineterminator = '\n').writerows(csv_rows)

# Script entry point: parse the options, load the page, then either grep it
# (-g) or print the nodes at the requested locations (-l). When neither -g
# nor -l is given the page is still loaded but nothing is printed.
if __name__ == "__main__":
    # NOTE: these are module-level names. Check the helper functions in this
    # file for uses of `url` / `soup` as globals before renaming any of them.
    grep, url, html_file, all_paths, relative, print_url, no_cache, phantomjs, verbose = readCL()
    # Must run before any output is written so that piping into e.g. head works.
    fix_broken_pipe()
    soup = get_soup(url, phantomjs, html_file, no_cache)
    # -g takes precedence over -l when both are supplied.
    if grep:
        main_grep(grep, soup, relative, verbose)
    elif all_paths:
        main_print_path(all_paths, soup, relative, print_url)
