-
Notifications
You must be signed in to change notification settings - Fork 0
/
netcen.py
53 lines (44 loc) · 1.4 KB
/
netcen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# importing the requests library
# Written with help from https://www.pythonforbeginners.com/python-on-the-web/web-scraping-with-beautifulsoup
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
# Module-level list of every URL recorded so far; main() seeds it with the
# start URL and hands it to crawler() as the ``visited`` list.
# (A ``global`` statement has no effect at module scope, so none is needed.)
redlist = []
def directorydots(parenturl, nextURL):
    """Debug hook for a dot-relative link: report it on stdout, unresolved.

    Writes the marker line ``directorydots`` followed by ``nextURL`` on its
    own line. ``parenturl`` is accepted but not used, and nothing is returned.
    """
    print("directorydots", nextURL, sep="\n")
def _resolve_link(href, page_url, base):
    """Return the absolute, fragment-free URL for ``href``, or None to skip it.

    ``href`` is resolved against the page it appeared on, so root-relative
    (``/a``), dot-relative (``./a``, ``../a``) and bare (``a.html``) links all
    work. Links are skipped when they are empty, malformed, use a scheme other
    than http/https (``mailto:``, ``javascript:``, ``irc:`` ...), or do not
    start with ``base``.
    """
    if not href:
        return None
    try:
        joined = urljoin(page_url, href.strip())
        # Drop ``#fragment`` so in-page anchors do not count as new pages.
        absolute, _fragment = urldefrag(joined)
        scheme = urlparse(absolute).scheme
    except ValueError:
        # urllib raises ValueError for some malformed URLs (e.g. bad IPv6).
        return None
    if scheme not in ("http", "https"):
        return None
    if not absolute.startswith(base):
        return None
    return absolute


def _page_links(page_url, base, timeout):
    """Fetch ``page_url`` and return its in-scope links in document order.

    A page that cannot be fetched, or that does not answer with status 200,
    yields an empty list.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for anchor in soup.find_all('a'):
        # Resolve against the final URL so redirects do not skew relative links.
        target = _resolve_link(anchor.get('href'), response.url, base)
        if target is not None:
            links.append(target)
    return links


def crawler(url, base, visited, timeout=10):
    """Crawl the pages reachable from ``url`` whose address starts with ``base``.

    Links are followed depth-first: each newly discovered URL is appended to
    ``visited``, printed, and crawled before the remaining links of the page
    it was found on.

    Args:
        url: Page to start crawling from.
        base: URL prefix a link must start with to be followed.
        visited: List of URLs already seen; it is extended in place.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The same ``visited`` list, now including every newly discovered URL.
        It is returned even when ``url`` itself cannot be fetched.
    """
    seen = set(visited)  # O(1) membership; ``visited`` keeps discovery order

    # An explicit stack of per-page link iterators replaces recursion, so a
    # deeply linked site cannot exhaust Python's recursion limit.
    stack = [iter(_page_links(url, base, timeout))]
    while stack:
        for target in stack[-1]:
            if target in seen:
                continue
            seen.add(target)
            visited.append(target)
            print(target)
            # Descend into the new page before finishing the current one.
            stack.append(iter(_page_links(target, base, timeout)))
            break
        else:
            # Current page has no unexplored links left.
            stack.pop()
    return visited
def main():
    """Prompt for a start URL, crawl from it, and print how many URLs were recorded.

    The entered URL is used both as the first page and as the crawl's base
    prefix, and is added to the module-level ``redlist`` before crawling.
    Example input: ``http://www.learnyouahaskell.com/``
    """
    # Strip stray whitespace so a trailing space does not corrupt the URL.
    start_url = input("Enter a URL inclusive of all http or https tags\n").strip()
    redlist.append(start_url)
    sites = crawler(start_url, start_url, redlist)
    # Fall back to the shared list if crawler() hands back nothing; it is the
    # same list the crawler appends to, so the count stays correct.
    if sites is None:
        sites = redlist
    print(len(sites))
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()