一、数据来源
数据来自四姑娘山景区官网首页新闻栏目的每日客流量发布页面。利用 Python 爬虫抓取 2015 年 9 月 29 日至 2020 年 6 月 8 日期间的每日客流量及其对应日期。
# Scrape daily visitor counts from the Siguniangshan scenic-area news listing
# pages and save them to sgns.txt as one "count,date" record per line.
import urllib.request

from bs4 import BeautifulSoup

BASE_URL = 'https://www.sgns.cn/news/number'
PAGE_COUNT = 166  # total number of listing pages on the site (10 entries each)


def _scrape_page(url):
    """Fetch one listing page and return (counts, dates) as lists of strings.

    The fixed slices trim constant-length boilerplate around the values in
    the title/date cells.  NOTE(review): the original applied these slices
    only to the first page and stored raw ``.text`` for pages 2+, which made
    the output file inconsistent; they are now applied uniformly — confirm
    the boilerplate really is identical on every page.
    """
    response = urllib.request.urlopen(url)
    soup = BeautifulSoup(response, 'html.parser')
    counts = [str(tag.text)[21:-10]
              for tag in soup.find_all(attrs={'headers': 'categorylist_header_title'})]
    dates = [str(tag.text)[8:-6]
             for tag in soup.find_all(attrs={'headers': 'categorylist_header_date'})]
    return counts, dates


numbers1 = []
times1 = []

# The first page has no "start" query parameter.
counts, dates = _scrape_page(BASE_URL)
numbers1.extend(counts)
times1.extend(dates)

# Subsequent pages are addressed with start=10, 20, ...
# NOTE: the original used range(1, page-1), which skipped the last page;
# range(1, PAGE_COUNT) visits every remaining page.
for page_index in range(1, PAGE_COUNT):
    counts, dates = _scrape_page(BASE_URL + '?start=' + str(10 * page_index))
    numbers1.extend(counts)
    times1.extend(dates)

# Write one "count,date" record per line.
with open('sgns.txt', 'w') as f:
    for count, date in zip(numbers1, times1):
        f.write(count + ',' + date + '\n')
接下来分析用R语言
# Load date-handling and plotting libraries, then read the scraped data.
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)

# Read the scraped visitor-count file: no header row, comma-separated.
data0 = read.csv(file = "C:/Users/91333/Documents/semester6/VS code/VScode Python/dgns1.txt",
                 header = FALSE,
                 sep = ",")
# 接下来进行数据清洗,把数据整理成可用形态 (next: clean the data into usable form)
data