init python instances (9e2b5acc) · Commits · Wenchao Zhang / Python-Instances

Douban/douban.ipynb

0 → 100644

+169 −0

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# 提取豆瓣电影Top250

		%% Cell type:code id: tags:

		``` python
		#coding:utf-8
		# 文件导出版本
		'''''
		@author: jsjxy
		@origin: https://www.cnblogs.com/xisheng/p/9130156.html
		'''
		from urllib import request
		from bs4 import BeautifulSoup
		import re
		#from distutils.filelist import findall

		def doubantop(N):
		    """Export the first N entries of the Douban Top 250 movie chart to a text file.

		    The chart is requested 25 entries at a time (``start=0, 25, 50, ...``).
		    One tab-separated row (title, rating, number of ratings, link) is written
		    per movie to ``./douban-top-<N>.txt``, below a title line and a column
		    header line.

		    Args:
		        N: Number of chart entries to export. Values that are not a multiple
		            of 25 are honoured exactly: the last page is fetched and the
		            output is cut off after N rows.
		    """
		    page_size = 25  # stride of the `start=` query parameter
		    written = 0
		    # Explicit UTF-8 so the Chinese titles are written the same way on every
		    # platform instead of depending on the locale's default encoding.
		    with open('./douban-top-%s.txt'%str(N),'w',encoding='utf-8') as f:
		        f.write("豆瓣电影TOP"+str(N)+'\n')
		        f.write('%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接'))
		        # Round the page count up; plain N//25 would drop the final partial
		        # page (and fetch nothing at all for N < 25).
		        for n in range((N + page_size - 1)//page_size):
		            url = 'https://movie.douban.com/top250?start='+str(page_size*n)+'&filter='
		            # Close the HTTP response as soon as the body has been read.
		            with request.urlopen(url) as page:
		                contents = page.read()
		            soup = BeautifulSoup(contents,"html.parser")

		            for tag in soup.find_all('div', class_='info'):
		                if written >= N:
		                    break  # last page may hold more entries than requested
		                m_name = tag.find('span', class_='title').get_text()
		                m_rating_score = float(tag.find('span',class_='rating_num').get_text())
		                m_people = tag.find('div',class_="star")
		                m_span = m_people.find_all('span')
		                # The fourth span carries the vote count followed by the
		                # `人评价` wording; keep only the digits.
		                m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
		                m_url=tag.find('a').get('href')
		                f.write('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url) )
		                written += 1
		    print('请查看文件douban-top-%s!' %str(N))

		# Ask how many chart entries to export, then write them to the text file.
		num = int(input("你想要看豆瓣电影TOP榜单的前几名(小于等于250且最好是25的倍数):"))
		doubantop(num)
		```

		%% Output

		你想要看豆瓣电影TOP榜单的前几名(小于等于250且最好是25的倍数):100
		请查看文件douban-top-100!

		%% Cell type:code id: tags:

		``` python
		# print 查看版本
		#coding:utf-8
		'''''
		@author: jsjxy
		@origin: https://www.cnblogs.com/xisheng/p/9130156.html
		'''
		from urllib import request
		from bs4 import BeautifulSoup
		import re
		#from distutils.filelist import findall

		# Scrape the ten chart pages (start=0, 25, ..., 225) once and keep one
		# preformatted, tab-separated row per movie in `all_list`.
		all_list = []
		for n in range(10):
		    url = 'https://movie.douban.com/top250?start='+str(25*n)+'&filter='
		    # Close the HTTP response as soon as the body has been read.
		    with request.urlopen(url) as page:
		        contents = page.read()
		    soup = BeautifulSoup(contents,"html.parser")
		    for tag in soup.find_all('div', class_='info'):
		        m_name = tag.find('span', class_='title').get_text()
		        m_rating_score = float(tag.find('span',class_='rating_num').get_text())
		        m_people = tag.find('div',class_="star")
		        m_span = m_people.find_all('span')
		        # The fourth span carries the vote count followed by the `人评价`
		        # wording; keep only the digits.
		        m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
		        m_url=tag.find('a').get('href')
		        all_list.append('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url))



		def doubantop(N,n=1):
		    """Print chart rows n through N (1-based, inclusive) from ``all_list``.

		    A title line and a column header are printed first, followed by one
		    ``print`` call per stored row.

		    Args:
		        N: Rank of the last row to show.
		        n: Rank of the first row to show; defaults to 1.
		    """
		    title = "豆瓣电影TOP "+str(n)+' - '+str(N)
		    columns = '%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接')
		    print(title+'\n'+columns)
		    # Ranks are 1-based, list indices 0-based.
		    selected = all_list[n-1:N]
		    for row in selected:
		        print(row)



		# Disabled interactive variant: read the upper rank from the user
		# (num = int(input(...))) instead of hard-coding it.
		# Show ranks 1 through 30 of the rows collected in all_list.
		doubantop(30)

		```

		%% Output

		豆瓣电影TOP 1 - 30
		影片名评分评价人数链接

		肖申克的救赎 9.6 1288574 https://movie.douban.com/subject/1292052/

		霸王别姬 9.6 951218 https://movie.douban.com/subject/1291546/

		这个杀手不太冷 9.4 1182952 https://movie.douban.com/subject/1295644/

		阿甘正传 9.4 1015734 https://movie.douban.com/subject/1292720/

		美丽人生 9.5 594513 https://movie.douban.com/subject/1292063/

		泰坦尼克号 9.3 951937 https://movie.douban.com/subject/1292722/

		千与千寻 9.3 944451 https://movie.douban.com/subject/1291561/

		辛德勒的名单 9.5 532059 https://movie.douban.com/subject/1295124/

		盗梦空间 9.3 1026845 https://movie.douban.com/subject/3541415/

		机器人总动员 9.3 683063 https://movie.douban.com/subject/2131459/

		忠犬八公的故事 9.3 671461 https://movie.douban.com/subject/3011091/

		三傻大闹宝莱坞 9.2 921705 https://movie.douban.com/subject/3793023/

		海上钢琴师 9.2 765462 https://movie.douban.com/subject/1292001/

		放牛班的春天 9.3 635757 https://movie.douban.com/subject/1291549/

		大话西游之大圣娶亲 9.2 707374 https://movie.douban.com/subject/1292213/

		楚门的世界 9.2 691053 https://movie.douban.com/subject/1292064/

		龙猫 9.2 623627 https://movie.douban.com/subject/1291560/

		星际穿越 9.2 701880 https://movie.douban.com/subject/1889243/

		教父 9.2 465133 https://movie.douban.com/subject/1291841/

		熔炉 9.3 403040 https://movie.douban.com/subject/5912992/

		无间道 9.1 581480 https://movie.douban.com/subject/1307914/

		当幸福来敲门 9.0 745379 https://movie.douban.com/subject/1849031/

		疯狂动物城 9.2 781539 https://movie.douban.com/subject/25662329/

		触不可及 9.2 491584 https://movie.douban.com/subject/6786002/

		怦然心动 9.0 810776 https://movie.douban.com/subject/3319755/

		乱世佳人 9.2 349486 https://movie.douban.com/subject/1300267/

		蝙蝠侠：黑暗骑士 9.1 469114 https://movie.douban.com/subject/1851857/

		活着 9.2 382700 https://movie.douban.com/subject/1292365/

		天堂电影院 9.1 380814 https://movie.douban.com/subject/1291828/

		少年派的奇幻漂流 9.0 755425 https://movie.douban.com/subject/1929463/


		%% Cell type:code id: tags:

		``` python
		```

Douban/douban.py

0 → 100644

+92 −0

Original line number	Diff line number	Diff line
		# -*- coding: utf-8 -*-
		"""douban.ipynb

		Automatically generated by Colaboratory.

		Original file is located at
		https://colab.research.google.com/drive/1qg0hwEsZ4IjUxPMAlKT0DwKyUgnjc6Zx

		# 提取豆瓣电影Top250
		"""

		#coding:utf-8
		# 文件导出版本
		'''''
		@author: jsjxy
		@origin: https://www.cnblogs.com/xisheng/p/9130156.html
		'''
		from urllib import request
		from bs4 import BeautifulSoup
		import re
		#from distutils.filelist import findall

		def doubantop(N):
		    """Export the first N entries of the Douban Top 250 movie chart to a text file.

		    The chart is requested 25 entries at a time (``start=0, 25, 50, ...``).
		    One tab-separated row (title, rating, number of ratings, link) is written
		    per movie to ``./douban-top-<N>.txt``, below a title line and a column
		    header line.

		    Args:
		        N: Number of chart entries to export. Values that are not a multiple
		            of 25 are honoured exactly: the last page is fetched and the
		            output is cut off after N rows.
		    """
		    page_size = 25  # stride of the `start=` query parameter
		    written = 0
		    # Explicit UTF-8 so the Chinese titles are written the same way on every
		    # platform instead of depending on the locale's default encoding.
		    with open('./douban-top-%s.txt'%str(N),'w',encoding='utf-8') as f:
		        f.write("豆瓣电影TOP"+str(N)+'\n')
		        f.write('%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接'))
		        # Round the page count up; plain N//25 would drop the final partial
		        # page (and fetch nothing at all for N < 25).
		        for n in range((N + page_size - 1)//page_size):
		            url = 'https://movie.douban.com/top250?start='+str(page_size*n)+'&filter='
		            # Close the HTTP response as soon as the body has been read.
		            with request.urlopen(url) as page:
		                contents = page.read()
		            soup = BeautifulSoup(contents,"html.parser")

		            for tag in soup.find_all('div', class_='info'):
		                if written >= N:
		                    break  # last page may hold more entries than requested
		                m_name = tag.find('span', class_='title').get_text()
		                m_rating_score = float(tag.find('span',class_='rating_num').get_text())
		                m_people = tag.find('div',class_="star")
		                m_span = m_people.find_all('span')
		                # The fourth span carries the vote count followed by the
		                # `人评价` wording; keep only the digits.
		                m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
		                m_url=tag.find('a').get('href')
		                f.write('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url) )
		                written += 1
		    print('请查看文件douban-top-%s!' %str(N))

		# Ask how many chart entries to export, then write them to the text file.
		num = int(input("你想要看豆瓣电影TOP榜单的前几名(小于等于250且最好是25的倍数):"))
		doubantop(num)

		# print 查看版本
		#coding:utf-8
		'''''
		@author: jsjxy
		@origin: https://www.cnblogs.com/xisheng/p/9130156.html
		'''
		from urllib import request
		from bs4 import BeautifulSoup
		import re
		#from distutils.filelist import findall

		# Scrape the ten chart pages (start=0, 25, ..., 225) once and keep one
		# preformatted, tab-separated row per movie in `all_list`.
		all_list = []
		for n in range(10):
		    url = 'https://movie.douban.com/top250?start='+str(25*n)+'&filter='
		    # Close the HTTP response as soon as the body has been read.
		    with request.urlopen(url) as page:
		        contents = page.read()
		    soup = BeautifulSoup(contents,"html.parser")
		    for tag in soup.find_all('div', class_='info'):
		        m_name = tag.find('span', class_='title').get_text()
		        m_rating_score = float(tag.find('span',class_='rating_num').get_text())
		        m_people = tag.find('div',class_="star")
		        m_span = m_people.find_all('span')
		        # The fourth span carries the vote count followed by the `人评价`
		        # wording; keep only the digits.
		        m_peoplecount = re.sub(r'[^0-9]','',m_span[3].contents[0])
		        m_url=tag.find('a').get('href')
		        all_list.append('%s\t%s\t%s\t%s\n'% (m_name,str(m_rating_score), m_peoplecount ,m_url))



		def doubantop(N,n=1):
		    """Print chart rows n through N (1-based, inclusive) from ``all_list``.

		    A title line and a column header are printed first, followed by one
		    ``print`` call per stored row.

		    Args:
		        N: Rank of the last row to show.
		        n: Rank of the first row to show; defaults to 1.
		    """
		    title = "豆瓣电影TOP "+str(n)+' - '+str(N)
		    columns = '%-13s%-5s%-15s%-40s\n'%('影片名','评分','评价人数','链接')
		    print(title+'\n'+columns)
		    # Ranks are 1-based, list indices 0-based.
		    selected = all_list[n-1:N]
		    for row in selected:
		        print(row)



		# Disabled interactive variant: read the upper rank from the user
		# (num = int(input(...))) instead of hard-coding it.
		# Show ranks 1 through 30 of the rows collected in all_list.
		doubantop(30)

Download/LectureNoteDownload.ipynb

0 → 100644

+76 −0

Original line number	Diff line number	Diff line
		%% Cell type:code id: tags:

		``` python
		from urllib import request
		from bs4 import BeautifulSoup
		import re

		# Download every file linked from the course index page.
		url=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/index.html'
		link=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/'

		# proxy={'http':'http://localhost:80'}
		# Present a browser-like User-Agent.
		headers = ("User-Agent"," Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
		opener = request.build_opener()
		opener.addheaders = [headers]
		# Installing the opener globally makes urlretrieve send the header as well.
		request.install_opener(opener)

		# Close the index-page response as soon as its body has been read.
		with request.urlopen(url) as response:
		    contents = response.read().decode()
		soup = BeautifulSoup(contents,"html.parser")
		# href=True skips anchors that have no href attribute; without it
		# tag.get('href') returns None and the concatenation below raises TypeError.
		for tag in soup.find_all('a', href=True):
		    pdf = tag.get('href')
		    pdfurl = link+pdf
		    print(pdfurl+"\n")
		    pdfdir = 'C:/Users/whzec/Desktop/'+pdf
		    # urlretrieve saves the resource to a local file; in Python 3 it lives
		    # in urllib.request, in Python 2 it was in urllib.
		    request.urlretrieve(pdfurl,pdfdir)
		```

		%% Output

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/syll.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw1.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw2.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw2a.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw3.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw3a.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/hw4.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec1.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec2.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec3.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec4.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec5.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec6.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec7.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec8.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec9.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec10.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Lec11.pdf

		http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/Bernotes.pdf


		%% Cell type:code id: tags:

		``` python
		```

Download/LectureNoteDownload.py

0 → 100644

+36 −0

Original line number	Diff line number	Diff line
		# -*- coding: utf-8 -*-
		"""LectureNoteDownload.ipynb

		Automatically generated by Colaboratory.

		Original file is located at
		https://colab.research.google.com/drive/1aE4KuvnzlfpXyd1fefTsj_jqoHnNkqxy
		"""

		from urllib import request
		from bs4 import BeautifulSoup
		import re

		# Download every file linked from the course index page.
		url=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/index.html'
		link=r'http://u.math.biu.ac.il/~solomyb/TEACH/18/GMT/'

		# proxy={'http':'http://localhost:80'}
		# Present a browser-like User-Agent.
		headers = ("User-Agent"," Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
		opener = request.build_opener()
		opener.addheaders = [headers]
		# Installing the opener globally makes urlretrieve send the header as well.
		request.install_opener(opener)

		# Close the index-page response as soon as its body has been read.
		with request.urlopen(url) as response:
		    contents = response.read().decode()
		soup = BeautifulSoup(contents,"html.parser")
		# href=True skips anchors that have no href attribute; without it
		# tag.get('href') returns None and the concatenation below raises TypeError.
		for tag in soup.find_all('a', href=True):
		    pdf = tag.get('href')
		    pdfurl = link+pdf
		    print(pdfurl+"\n")
		    pdfdir = 'C:/Users/whzec/Desktop/'+pdf
		    # urlretrieve saves the resource to a local file; in Python 3 it lives
		    # in urllib.request, in Python 2 it was in urllib.
		    request.urlretrieve(pdfurl,pdfdir)

Filesoperation/replacetxt.ipynb

0 → 100644

+25 −0

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# 文件的读取与写入

		首先我们给出一个非常有用的例子, 即文件的替换.

		%% Cell type:code id: tags:

		``` python
		# replace some words by python
		import re
		# Rewrite the file in place with every occurrence of 'List' replaced.
		# The r prefix makes the path a raw string so its backslashes are taken
		# literally; the alternatives are forward slashes or doubled backslashes.
		# `with` guarantees each handle is closed even if reading or writing fails.
		with open(r'C:\Users\whzec\Desktop\1.md','r') as f:
		    alllines=f.readlines()
		# Opening with 'w' truncates the file; the lines read above are written back
		# after substitution.
		with open(r'C:\Users\whzec\Desktop\1.md','w') as f:
		    f.writelines(re.sub('List','cctvcctv',eachline) for eachline in alllines)
		```

		%% Cell type:code id: tags:

		``` python
		```

Admin message