Scraping 电视猫 (tvmao.com) with Python to Build an EPG (Part 2)

1. Wrap fetching a single channel's EPG into a function and save it as tvmao.py

#!/usr/bin/python
# coding: utf-8
# by 黑鸟博客

import time
import base64
import requests
from bs4 import BeautifulSoup
 
 
def is_valid_date(strdate):
	"""Return True if strdate looks like an HH:MM broadcast time."""
	try:
		if ":" not in strdate:
			return False
		time.strptime(strdate, "%H:%M")
		return True
	except ValueError:
		return False
		
 
def sub_req(a, q, id):
	"""Build the 'p' parameter of the /api/pg request that returns the
	second half of the day's programme list.

	The value is one character picked from a key table by weekday,
	followed by base64(id|a) and base64(|q), where a, q and id come
	from the hidden <form> on the channel page."""
	_keyStr = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="

	v = base64.b64encode(("|" + q).encode('utf-8'))
	w = base64.b64encode((id + "|" + a).encode('utf-8'))

	# Weekday 1-7 (Monday-Sunday); time.strftime("%w") gives 0 for Sunday.
	str3 = time.strftime("%w")
	wday = 7 if int(str3) == 0 else int(str3)
	F = _keyStr[wday * wday]

	return F + w.decode('utf-8') + v.decode('utf-8')
	
def get_program_info(link, sublink, week_day, epg_file_name):

	# Write a date header such as "2024/05/20 Monday" for the day this
	# page describes (week_day is 1-7, matching the w1..w7 page suffix).
	day_offset = (week_day - int(time.strftime("%w"))) * 24 * 3600
	header = time.strftime("%Y/%m/%d %A", time.localtime(time.time() + day_offset))
	with open(epg_file_name, "a+") as f:
		f.write(header)
		f.write("\n\n")

	headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
			'Connection': 'keep-alive', 'Cache-Control': 'no-cache'}
	website = '%s%s' % (link, sublink)
	r = requests.get(website, headers=headers)

	soup = BeautifulSoup(r.text, 'lxml')
	# The first half of the listing is rendered directly in the page:
	# every <span> inside <div class="epg"> is either a time or a title.
	list_program_div = soup.find(name='div', attrs={"class": "epg"}).find_all(name='span')
	with open(epg_file_name, "a+") as f:
		for tagprogram in list_program_div:
			try:
				if is_valid_date(tagprogram.text):
					# A time such as "19:00": keep the title on the same line.
					f.write(tagprogram.text)
					f.write("\t")
				elif tagprogram.text != '正在播出':
					# A programme title; skip the "now playing" marker.
					f.write(tagprogram.text)
					f.write("\n")
			except:
				continue

	# The second half of the day is loaded by AJAX from /api/pg; the hidden
	# <form> on the page carries the a/q/id values needed to build the request.
	list_first_form = soup.find(name='form')
	sublink = "/api/pg?p=" + sub_req(list_first_form["a"], list_first_form["q"], list_first_form.button["id"])

	website = '%s%s' % (link, sublink)
	sub_r = requests.get(website)

	# The JSON response's second element is an HTML fragment with the same
	# <span> structure as the first half of the listing.
	soup = BeautifulSoup(sub_r.json()[1], 'lxml')
	list_program_div = soup.find_all(name='span')

	with open(epg_file_name, "a+") as f:
		for tagprogram in list_program_div:
			try:
				if is_valid_date(tagprogram.text):
					f.write(tagprogram.text)
					f.write("\t")
				elif tagprogram.text != '正在播出':
					f.write(tagprogram.text)
					f.write("\n")
			except:
				continue
		f.write("\n\n")


def get_program(link, sublink, week_day, epg_file_name):

	get_program_info(link, sublink, week_day, epg_file_name)
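
With the module saved as tvmao.py, a single call fetches one channel for one day and appends it to a text file; section 2 below simply loops this over channels and weekdays. A quick standalone test (the output file name here is arbitrary):

import tvmao

# Append Monday's CCTV1 listing (page /program/CCTV-CCTV1-w1.html) to CCTV1.txt.
tvmao.get_program("https://www.tvmao.com", "/program/CCTV-CCTV1-w1.html", 1, "CCTV1.txt")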
	

2. Call the functions wrapped in tvmao to fetch the EPG for multiple channels.

#!/usr/bin/python
# coding: utf-8
# by 黑鸟博客

import os
import tvmao

link = "https://www.tvmao.com"

# CCTV channels; the names are copied from the channel page URLs.
CCTV_prog = ['CCTV1', 'CCTV2', 'CCTV3', 'CCTV4', 'CCTV5', 'CCTV6']

epg_path = 'epg/cctv/'
if not os.path.exists(epg_path):
	os.makedirs(epg_path)

for prog in CCTV_prog:
	epg_name = epg_path + prog + '.txt'
	# Truncate any existing file, then append one day at a time.
	with open(epg_name, "w+") as f:
		f.write("")
	print(prog)
	for num in range(1, 8):
		# w1..w7 are the seven weekday pages for the channel.
		sublink = "/program/CCTV-" + prog + "-w" + str(num) + ".html"
		tvmao.get_program(link, sublink, num, epg_name)


# Provincial channels; the names are copied from the channel page URLs.
province_prog = ['AHTV1', 'BTV1', 'CCQTV1', 'FJTV2', 'XMTV5', 'HUNANTV1']

epg_path = 'epg/province/'
if not os.path.exists(epg_path):
	os.makedirs(epg_path)

for prog in province_prog:
	epg_name = epg_path + prog + '.txt'
	with open(epg_name, "w+") as f:
		f.write("")
	print(prog)
	for num in range(1, 8):
		sublink = "/program_satellite/" + prog + "-w" + str(num) + ".html"
		tvmao.get_program(link, sublink, num, epg_name)
 

What remains is to process the EPG files according to your own needs.
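
As one example of that processing, here is a minimal sketch that parses a channel file produced above into (date, time, title) tuples. It assumes only the layout written by tvmao.py: a "YYYY/MM/DD Weekday" header line, then one "HH:MM<tab>title" line per programme; the parse_epg name and the final print are just for illustration.

# Sketch: parse one EPG text file written by tvmao.py into (date, time, title) tuples.
def parse_epg(path):
	entries = []
	current_date = None
	with open(path, encoding='utf-8') as f:
		for line in f:
			line = line.rstrip('\n')
			if not line:
				continue
			if '\t' in line:
				# Programme line: "HH:MM<tab>title".
				start, title = line.split('\t', 1)
				entries.append((current_date, start, title))
			else:
				# Header line such as "2024/05/20 Monday": keep only the date part.
				current_date = line.split()[0]
	return entries

print(parse_epg('epg/cctv/CCTV1.txt')[:5])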

Note: the channel page paths above were copied manually from the website; to add a channel, append its name to the list. The correctness of each path has to be confirmed by hand.
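
Before adding a new name to a list, a copied path can be sanity-checked by requesting it and making sure the page actually contains the <div class="epg"> block that tvmao.py relies on. The check_sublink helper below is only an illustration, not part of the scripts above:

import requests
from bs4 import BeautifulSoup

# Return True if the channel page exists and carries an EPG block.
def check_sublink(link, sublink):
	headers = {'User-Agent': 'Mozilla/5.0'}
	r = requests.get(link + sublink, headers=headers)
	if r.status_code != 200:
		return False
	soup = BeautifulSoup(r.text, 'lxml')
	return soup.find(name='div', attrs={"class": "epg"}) is not None

print(check_sublink("https://www.tvmao.com", "/program/CCTV-CCTV1-w1.html"))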

The next step is to fetch the channel paths from the website itself, making the whole process fully automatic.
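
That step is not implemented here, but the rough idea would be to scrape a channel index page and collect every href that matches the weekday-page pattern already used above. The sketch below is only a starting point: the index URL and the assumption that channel links end in "-w1.html" are unverified guesses about the site.

import re
import requests
from bs4 import BeautifulSoup

# Hypothetical: collect candidate channel paths from an index page.
def discover_sublinks(index_url):
	r = requests.get(index_url, headers={'User-Agent': 'Mozilla/5.0'})
	soup = BeautifulSoup(r.text, 'lxml')
	pattern = re.compile(r'^/program(_satellite)?/.+-w1\.html$')
	return sorted({a['href'] for a in soup.find_all('a', href=True) if pattern.match(a['href'])})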
