美文网首页
26 Pandas处理分析网站原始访问日志

26 Pandas处理分析网站原始访问日志

作者: Viterbi | 来源:发表于2022-11-15 19:20 被阅读0次

26 Pandas处理分析网站原始访问日志

目标:真实项目的实战,探索Pandas的数据处理与分析

实例: 数据来源:我自己的wordpress博客http://www.crazyant.net/ 的访问日志

实现步骤:
1、读取数据、清理、格式化
2、统计爬虫spider的访问比例,输出柱状图
3、统计HTTP状态码的访问占比,输出饼图
4、统计按小时、按天的PV/UV流量趋势,输出折线图

1、读取数据并清理格式化

import pandas as pd
import numpy as np

# Show full cell contents (user-agent strings are long). `None` means
# "no width limit"; the old `-1` spelling was deprecated in pandas 1.0
# and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line

# Read every file in the log directory and merge them into one DataFrame.
data_dir = "./datas/crazyant/blog_access_log"

import os

# os.listdir returns entries in arbitrary order, so sort the listing to make
# the concatenation order reproducible.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines="warn" keeps the same behaviour: rows with too many fields are
# skipped and reported as warnings instead of aborting the read.
df_list = [
    pd.read_csv(
        os.path.join(data_dir, fname),
        sep=" ",
        header=None,
        on_bad_lines="warn",
    )
    for fname in sorted(os.listdir(data_dir))
]

df = pd.concat(df_list)

    b'Skipping line 2245: expected 10 fields, saw 16\nSkipping line 2889: expected 10 fields, saw 14\nSkipping line 2890: expected 10 fields, saw 14\nSkipping line 2891: expected 10 fields, saw 13\nSkipping line 2892: expected 10 fields, saw 13\nSkipping line 2900: expected 10 fields, saw 11\nSkipping line 2902: expected 10 fields, saw 11\nSkipping line 3790: expected 10 fields, saw 14\nSkipping line 3791: expected 10 fields, saw 14\nSkipping line 3792: expected 10 fields, saw 13\nSkipping line 3793: expected 10 fields, saw 13\nSkipping line 3833: expected 10 fields, saw 11\nSkipping line 3835: expected 10 fields, saw 11\nSkipping line 9936: expected 10 fields, saw 16\n'
    b'Skipping line 11748: expected 10 fields, saw 11\nSkipping line 11750: expected 10 fields, saw 11\n'
    



# Preview the first rows of the concatenated raw log data.
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
0 1 2 3 4 5 6 7 8 9
0 106.11.153.226 - - [02/Dec/2019:22:40:18 +0800] GET /740.html?replytocom=1194 HTTP/1.0 200 13446 - YisouSpider
1 42.156.254.60 - - [02/Dec/2019:22:40:23 +0800] POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0 201 55 http://www.crazyant.net/740.html?replytocom=1194 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2 106.11.159.254 - - [02/Dec/2019:22:40:27 +0800] GET /576.html HTTP/1.0 200 13461 - YisouSpider
3 106.11.157.254 - - [02/Dec/2019:22:40:28 +0800] GET /?lwfcdw=t9n2d3&oqzohc=m5e7j1&oubyvq=iab6a3&oudmbg=6osqd3 HTTP/1.0 200 10485 - YisouSpider
4 42.156.137.109 - - [02/Dec/2019:22:40:30 +0800] POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0 201 55 http://www.crazyant.net/576.html Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Keep only positional columns 0, 3, 6 and 9; copy so later column
# assignments operate on an independent frame.
df = df.loc[:, [0, 3, 6, 9]].copy()
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
0 3 6 9
0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider
1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider
3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider
4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Give the four retained columns readable names: client IP, request time,
# HTTP status code and user-agent string.
df.columns = ["ip", "stime", "status", "client"]
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
ip stime status client
0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider
1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider
3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider
4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
# Inspect the dtype pandas inferred for each column.
df.dtypes




    ip        object
    stime     object
    status    int64 
    client    object
    dtype: object

2、统计spider的比例

# Flag requests whose user-agent mentions "spider" (case-insensitive).
# na=False treats a missing user-agent as "not a spider" instead of letting
# NaN leak into the flag column (value_counts would silently drop those rows).
# regex=False matches the literal substring without compiling a pattern.
df["is_spider"] = (
    df["client"].str.lower().str.contains("spider", na=False, regex=False)
)
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
ip stime status client is_spider
0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider True
1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider True
3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider True
4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
# Count how many requests were / were not flagged as spider traffic.
df_spider = df.loc[:, "is_spider"].value_counts()
df_spider



    False    46641
    True     3637 
    Name: is_spider, dtype: int64




# Bar chart comparing spider and non-spider request counts.
# The builder methods return the chart itself, so calling them one at a
# time is equivalent to the chained form.
bar = Bar()
bar.add_xaxis(list(map(str, df_spider.index)))
bar.add_yaxis("是否Spider", df_spider.values.tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="爬虫访问量占比"))
bar.render_notebook()

3、访问状态码的数量对比

# Number of requests per HTTP status code (index: status, value: row count).
df_status = df.groupby(by="status").size()
df_status



    status
    200    41924
    201    3432 
    206    70   
    301    2364 
    302    23   
    304    19   
    400    20   
    403    92   
    404    1474 
    405    12   
    444    846  
    500    1    
    504    1    
    dtype: int64


# Show the (status code, count) pairs in the shape the pie chart expects.
list(df_status.items())




    [(200, 41924),
     (201, 3432),
     (206, 70),
     (301, 2364),
     (302, 23),
     (304, 19),
     (400, 20),
     (403, 92),
     (404, 1474),
     (405, 12),
     (444, 846),
     (500, 1),
     (504, 1)]



# Pie chart of request counts per HTTP status code; each slice is labelled
# "<status>: <count>".
pie = Pie()
pie.add("状态码比例", [*zip(df_status.index, df_status)])
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()

4、实现按小时、按天粒度的流量统计

# Preview the frame; "stime" is still a raw string with a leading "[".
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
ip stime status client is_spider
0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider True
1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider True
3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider True
4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
# Drop the leading "[" left over from the log format, then parse the
# remainder (e.g. "02/Dec/2019:22:40:18") into datetime values.
df["stime"] = pd.to_datetime(
    df["stime"].str.slice(start=1),
    format="%d/%b/%Y:%H:%M:%S",
)
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
ip stime status client is_spider
0 106.11.153.226 2019-12-02 22:40:18 200 YisouSpider True
1 42.156.254.60 2019-12-02 22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
2 106.11.159.254 2019-12-02 22:40:27 200 YisouSpider True
3 106.11.157.254 2019-12-02 22:40:28 200 YisouSpider True
4 42.156.137.109 2019-12-02 22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
# Use the request time as the index and order rows chronologically so the
# frame can be resampled by time.
df = df.set_index("stime").sort_index()
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
ip status client is_spider
stime
2019-12-02 22:40:18 106.11.153.226 200 YisouSpider True
2019-12-02 22:40:23 42.156.254.60 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
2019-12-02 22:40:27 106.11.159.254 200 YisouSpider True
2019-12-02 22:40:28 106.11.157.254 200 YisouSpider True
2019-12-02 22:40:30 42.156.137.109 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
# Inspect the index after "stime" has been set as the index and sorted.
df.index



    DatetimeIndex(['2019-12-02 22:40:18', '2019-12-02 22:40:23',
                   '2019-12-02 22:40:27', '2019-12-02 22:40:28',
                   '2019-12-02 22:40:30', '2019-12-02 22:40:46',
                   '2019-12-02 22:41:52', '2019-12-02 22:41:52',
                   '2019-12-02 22:41:55', '2019-12-02 22:42:16',
                   ...
                   '2019-12-07 21:30:16', '2019-12-07 21:30:17',
                   '2019-12-07 21:30:19', '2019-12-07 21:30:20',
                   '2019-12-07 21:30:21', '2019-12-07 21:30:22',
                   '2019-12-07 21:30:23', '2019-12-07 21:30:56',
                   '2019-12-07 21:30:58', '2019-12-07 21:31:02'],
                  dtype='datetime64[ns]', name='stime', length=50278, freq=None)



# Hourly statistics
# NOTE(review): newer pandas releases prefer the lowercase "h" alias over
# "H" -- confirm against the installed version before enabling.
#df_pvuv = df.resample("H")["ip"].agg(pv=np.size, uv=pd.Series.nunique)

# Statistics per 6-hour window
#df_pvuv = df.resample("6H")["ip"].agg(pv=np.size, uv=pd.Series.nunique)

# Daily statistics: pv = number of requests in the bin,
# uv = number of distinct client IPs in the bin.
df_pvuv = df.resample("D")["ip"].agg(pv=np.size, uv=pd.Series.nunique)

df_pvuv.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
pv uv
stime
2019-12-02 288 70
2019-12-03 10285 1180
2019-12-04 13618 1197
2019-12-05 10485 1152
2019-12-06 9469 1261
# Line chart with one PV and one UV series over the resampled time bins.
# The builder methods return the chart itself, so calling them one at a
# time is equivalent to the chained form.
line = Line()
line.add_xaxis(df_pvuv.index.to_list())
line.add_yaxis("PV", df_pvuv["pv"].to_list())
line.add_yaxis("UV", df_pvuv["uv"].to_list())
line.set_global_opts(
    title_opts=opts.TitleOpts(title="PVUV数据对比"),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
)
line.render_notebook()

本文使用 文章同步助手 同步

相关文章

网友评论

      本文标题:26 Pandas处理分析网站原始访问日志

      本文链接:https://www.haomeiwen.com/subject/tatjtdtx.html