import requests
import json
import sys
import multiprocessing
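
# Crawl the sites listed in top-1m.csv in parallel blocks and record, for each
# site, whether its front page contains a <script> tag. Progress is persisted to
# crawl-state.json (with a second copy in crawl-state-red.json) so interrupted
# runs can resume; sites that were already crawled successfully are skipped.
# Example invocation (script name assumed): python crawl.py <chunk-size> <start-index>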

# number of sites to fetch in parallel in each block
chunksize = int(sys.argv[1])
# index in the domain list to start crawling from
initCounter = int(sys.argv[2])

# write the raw results to disk (plus a second copy in crawl-state-red.json)
def writeState(state):
	print("Writing state")
	serialized = json.dumps(state)
	with open("crawl-state.json", "w") as statefile:
		statefile.write(serialized)
	with open("crawl-state-red.json", "w") as statefile:
		statefile.write(serialized)

# read the results of a previous run; start with an empty state if there is none yet
def getState():
	try:
		with open("crawl-state.json", "r") as statefile:
			return json.loads(statefile.read())
	except FileNotFoundError:
		return {}

# read the list of sites to crawl; each line of top-1m.csv is "rank,domain"
domains = []
with open("top-1m.csv", "r") as fileHandle:
	for line in fileHandle:
		line = line.strip()
		if not line:
			continue
		rank, domain = line.split(",", 1)
		domains.append([int(rank), domain])

# fetch one site and report whether its page contains a <script> tag, or record the error
def getDomainInfo(domain, outQueue):
	try:
		# cap the request time (arbitrary 60s) so one unresponsive site cannot hang its worker
		websiteSrc = requests.get("http://" + domain[1], timeout=60).text

		if "<script" in websiteSrc:
			result = (str(domain[0]), [domain[1], "Success", "script"])
		else:
			result = (str(domain[0]), [domain[1], "Success", "noscript"])
	except Exception as e:
		result = (str(domain[0]), [domain[1], "Fail", str(e)])

	outQueue.put(result)

	print("No: " + str(domain[0]))
	print(result)

# load the results from the last run; maps str(rank) -> [domain, status, detail]
results = getState()

#continue crawling forever
while True:
	#loop over all sites
	counter = initCounter
	while counter < len(domains):
		#prepare this block of sites
		outQueue = multiprocessing.Queue()
		processes = []
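		# a single shared queue collects the (str(rank), [domain, status, detail]) results from the workers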

		#crawl this block of sites in parallel
		print("dispatching jobs")
		processTask = {}
		# end index of this block, clamped to the end of the domain list
		endCounter = min(counter + chunksize, len(domains))
		while counter < endCounter:
			domain = domains[counter]
			counter += 1

			# skip sites that were already crawled successfully in a previous pass
			if str(domain[0]) in results and results[str(domain[0])][1] == "Success":
				print("skip " + domain[1])
				continue

			process = multiprocessing.Process(target=getDomainInfo, args=(domain, outQueue))
			# remember which domain each worker handles so a timeout can be attributed to it
			processTask[process] = domain
			process.daemon = True
			process.start()
			processes.append(process)

		#wait for the results to come in
		print("waiting for finish")
		print(processes)
		for process in processes:
			# give each worker up to 100 seconds before treating it as stuck
			process.join(timeout=100)
			if process.is_alive():
				# kill the stuck worker and record a failure for its domain
				process.terminate()
				timedOut = processTask[process]
				outQueue.put((str(timedOut[0]), [timedOut[1], "Fail", "Processing Timeout"]))

		#add the results of this block of sites to all results
		print("adding to state")
		while not outQueue.empty():
			item = outQueue.get()
			results[item[0]] = item[1]

		#save the results from this block of sites
		writeState(results)

	#save the results from this run
	writeState(results)

	# start the next pass from the beginning; sites that failed will be retried
	initCounter = 0
