美文网首页
数据清洗

数据清洗

作者: 月夜星空下 | 来源:发表于2020-07-13 10:04 被阅读0次
import re
import os
import csv
import time
import codecs
import random
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Interactive setup: ask for the raw text file, make sure the output folder
# exists, then ask for the name of the cleaned file.
Inpath = input("请输入待清洗文本路径:")
desktop_path = '/Users/lilong/Desktop/Data_cleaning/'
# exist_ok=True replaces the separate exists() check and its race window.
os.makedirs(desktop_path, exist_ok=True)
name = input("请输入清洗后文本名称:")
The_custom_path = desktop_path + name + '.csv'
# Create the output file with a one-space placeholder and close the handle
# immediately. Leaving it open keeps the " " in the write buffer until
# interpreter exit, where the late flush lands at offset 0 and clobbers the
# first byte of the finished output.
with open(The_custom_path, 'w') as file_w:
    file_w.write(" ")
# Both handles stay open on purpose: later top-level code iterates `file`
# (the raw input) and reads `file_r` (the output file) after write() fills it.
file = open(Inpath, 'r')
# write() stores the output as UTF-8, so read it back with the same encoding.
file_r = open(The_custom_path, 'r', encoding='utf-8')
def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
    """Load the stop-word list, one entry per line.

    Args:
        path: Location of the UTF-8 encoded stop-word file. Defaults to the
            path the script has always used.

    Returns:
        A list with every line of the file stripped of surrounding
        whitespace, in file order. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if decoding fails midway.
    with open(path, encoding='UTF-8') as handle:
        return [line.strip() for line in handle]

def _marker_rules(labels, suffix, prefix=''):
    """Return (old, '') rules deleting prefix+label+suffix for each label, in order."""
    return [(prefix + label + suffix, '') for label in labels]


_DIGITS_1_8 = '12345678'
_CN_1_8 = '一二三四五六七八'
_CN_1_4 = '一二三四'

# Ordered substitution rules applied to every input line. The order matters:
# later rules operate on the output of earlier ones.
_LINE_RULES = (
    [
        ('  ', '\r\n'),           # whitespace becomes a line break
        (' ', '\r\n'),
        ('阿卡索', '立刻说'),      # brand substitution
        ('2019', '2020'),         # year normalisation
        ('2018', '2020'),
        ('00年', '20年'),
    ]
    # Strip list/enumeration markers: "1," "1." "(1)" "1:" "1、" "一、" "一." "a."
    + _marker_rules(_DIGITS_1_8, ',')
    + _marker_rules('123456789', '.')
    + _marker_rules(_DIGITS_1_8, ')', prefix='(')
    + _marker_rules(_DIGITS_1_8, ':')
    + _marker_rules(_DIGITS_1_8, '、')
    + _marker_rules(_CN_1_8, '、')
    + _marker_rules(_CN_1_8, '.')
    + _marker_rules('abcd', '.')
    + [
        # Collapse punctuation left behind by the removals above.
        ('.。', '。'),
        ('。 ', ','),
        ('。。', '。'),
        ('?。', '。'),
        ('。,', '。'),
        ('。。', '。'),
        (',。', '。'),
        (',', ',,'),
        ('""', ''),
        ('。\n,', '。'),
        ('。\n。', '。'),
        (',\n。', '。'),
    ]
    + _marker_rules(_DIGITS_1_8, ')')
    # Ordinal phrases: "第一点", "的第一步是", "一。", "第一".
    + _marker_rules(_CN_1_4, '点', prefix='第')
    + _marker_rules(_CN_1_4, '步是', prefix='的第')
    + _marker_rules(_CN_1_8, '。')
    + _marker_rules(_CN_1_4, '', prefix='第')
)

# Fragments containing any of these tokens are dropped. The earlier
# expression `'com' or 'www' or 'text-align' or '元'` evaluates to just
# 'com', so the remaining tokens were silently ignored; a regex alternation
# applies all four.
_NOISE_RE = re.compile('com|www|text-align|元')


def _clean_line(line):
    """Apply the substitution rules to one line and drop noisy fragments.

    The line is split on ',' (the earlier `',' or '。' or ...` argument also
    evaluated to ',' only); each kept fragment gets its comma appended back,
    so the result always ends with a trailing ','.
    """
    for old, new in _LINE_RULES:
        line = line.replace(old, new)
    return ''.join(
        fragment + ','
        for fragment in line.split(',')
        if _NOISE_RE.search(fragment) is None
    )


# `file` is the raw input handle opened earlier in the script.
content = ''.join(_clean_line(raw_line) for raw_line in file)
def write(content, path=None):
    """Replace the contents of the output file with *content*.

    Args:
        content: Text to store; it is encoded as UTF-8.
        path: Destination file. Defaults to the module-level
            ``The_custom_path`` when omitted.

    Returns:
        The same *content* that was passed in.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    target = The_custom_path if path is None else path
    # One handle, closed deterministically. newline='' writes '\n' and
    # '\r\n' exactly as given, with no platform newline translation.
    with open(target, 'w', encoding='utf-8', newline='') as handle:
        handle.write(content)
    return content

# Persist the first-pass result, then read it back through `file_r` and keep
# only lines whose length (line ending included) is strictly between 150
# and 250 characters.
write(content)
try:
    con = ''.join(line for line in file_r if 150 < len(line) < 250)
finally:
    # Release both handles: the output reader used above (previously never
    # closed) and the raw input file, which has been fully consumed.
    file_r.close()
    file.close()
# Collapse runs of commas, longest first, then remove four-space runs.
con = con.replace(',,,,', ',')
con = con.replace(',,,', ',')
conn = con.replace(',,', ',')
connn = conn.replace('    ', '')

write(connn)

# De-duplicate the filtered text with pandas: load the output file using a
# single space as the field separator, drop repeated rows, relabel the
# column, and write the frame back over the same file.
# NOTE(review): read_csv is called without `header=`, so the first line of
# the file is presumably consumed as the column name and then replaced by the
# label below rather than kept as data - confirm this is intended.
df = pd.read_csv(The_custom_path, sep=' ')
df1 = df.drop_duplicates()
# Assigning a one-element list assumes the frame has exactly one column,
# i.e. no line contained the separator - TODO confirm.
df1.columns = ['"ook"']

# index=0 is falsy, so no index column is written.
# NOTE(review): to_csv presumably returns None when given a path, which would
# leave `cleaning_result` empty - verify before relying on it.
cleaning_result = df1.to_csv(The_custom_path, index=0)


# Re-read the de-duplicated file and rewrite every line as `"<text>。"`
# followed by CRLF. Each line is trimmed of surrounding whitespace, then of
# leading commas, then of leading colons (in that order).
# NOTE(review): the earlier code called lstrip(',') twice in a row; the
# second call may originally have targeted a different comma character -
# confirm against the author's copy.
with open(The_custom_path, encoding='utf-8') as f3:
    cNames = [
        '"' + raw_line.strip().lstrip(',').lstrip(':') + '。"\r\n'
        for raw_line in f3
    ]
all_content = ''.join(cNames)
# write() truncates and rewrites the file in one step, so a separate
# writelines() pass just before it would be overwritten immediately.
write(all_content)


# Ordered punctuation clean-up rules for the quoted output. Each rule runs on
# the result of the previous one, so the sequence (duplicates included) must
# stay exactly as listed.
_punctuation_rules = (
    ('""', ''),
    ('。。', '。'),
    ('。。。', '。'),
    ('以下', '这些'),
    ('其次', ''),
    ('.。', '。'),
    # Move a closing quote that ended up on the following line back up.
    (',\n。"', '。"\n'),
    (',\n。"', '。"\n'),
    (':\n。"', '。"\n'),
    ('\n。"', '。"\n'),
    (',"。"', '。"'),
    ('。"。"', '。"'),
    (':"。"', '。"'),
    ('。。"', '。"'),
    ('。""', '。"'),
    ('。    。', '。'),
    (' ', ''),
    (' 。', '。'),
    (',。', '。'),
    (';,', '。'),
    (':(。"', '。'),
    ('。(。', '。"'),
    ('",', '"'),
    (',,', ','),
    ('①', ''),
    (':。', '。'),
    # Digits 1-5 are removed wherever they occur.
    ('1', ''),
    ('2', ''),
    ('3', ''),
    ('4', ''),
    ('5', ''),
    ('?。', '?'),
    (':,', ','),
    (',,', ','),
    ('!important;text-align:left;}', ''),
    ('"“', '"'),
    ('。”。"', '。"'),
    ('?。', '?'),
    ('!。', '?'),
)

# Load the whole output file, run every rule over it, and store the result.
with open(The_custom_path, 'r') as file_r:
    ct = file_r.read()
cg = ct
for _old, _new in _punctuation_rules:
    cg = cg.replace(_old, _new)

write(cg)
# Final pass: collapse doubled full stops on each line and drop lines that
# are two characters or shorter (line ending included).
# The context manager closes this reader; the earlier `finally` closed the
# raw-input handle `file` instead, leaving the reader open.
with open(The_custom_path, 'r', encoding='utf-8') as file_r:
    squeezed_lines = (line.replace('。。', "。") for line in file_r)
    con1 = ''.join(line for line in squeezed_lines if len(line) > 2)
write(con1)

print("ok")


# con = ''
# with open(The_custom_path)as file_r:
#     try:
#         con = ''
#
#         while True:
#
#             text_line = file_r.readline()
#             if text_line:
#                 if len(text_line) > 150 and len(text_line) < 250:
#                     # text_line = text_line.replace("\n", "")
#                     con += text_line
#                     # print(con)
#             else:
#                 break
#         # print(con)
#     finally:
#         file_r.close()
# write(con)


# df2 = pd.read_csv(The_custom_path, sep=' ')
# df3 = df2.drop_duplicates()
# df2.columns = ['"content"']
# cleaning_result = df2.to_csv(The_custom_path, index=0)
# print("ok")

# ctt = ''
# for i in file_r:
#     regex = 'com' or 'www'
#     p_string = i.split(':')
#     coo = ''
#     for line1 in p_string:
#         line1 = line1.replace(" ", "")
#         if re.search(regex, line1) is None:
#             coo += line1
#         coo += line1
#     ctt += coo

# df = pd.read_csv(The_custom_path, sep=' ')
# print("ok")
# df1 = df.drop_duplicates()
# df1.columns = ['"content"']
# cleaning_result = df1.to_csv(The_custom_path, index=0)


相关文章

  • 第三章-数据预处理

    数据预处理的主要内容包括数据清洗、数据集成、数据变换和数据规约。 3.1数据清洗 数据清洗主要是删除原始数据集中的...

  • 2019-09-14 分析lianjia数据(四)——Power

    分析lianjia房源数据(一)——Python数据清洗 分析lianjia房源数据(二)——SPSS数据清洗 分...

  • 2019-10-03 分析lianjia数据(五)——生成词云图

    分析lianjia房源数据(一)——Python数据清洗 分析lianjia房源数据(二)——SPSS数据清洗 分...

  • 2019-08-19 分析lianjia数据(三)——SPSS数

    前置内容——lianjia数据清洗 分析lianjia房源数据(一)——Python数据清洗 分析lianjia房...

  • 数据清洗的步骤是什么(上)

    数据清洗工作是数据分析工作中不可缺少的步骤,这是因为数据清洗能够处理掉肮脏数据,如果不清洗数据的话,那么数据分析的...

  • 机器学习-数据清洗

    本文由brzhang发表 数据清洗 首先,为何需要对数据进行清洗 数据清洗的工作绝壁是非常枯燥的,做数据研究的人...

  • 数据清洗

    从两个角度上看,数据清洗一是为了解决数据质量问题,二是让数据更适合做挖掘。不同的目的下分不同的情况,也都有相应的解...

  • 数据清洗

    数据清洗 重复数据处理(推荐使用顺序) 数据透视表可统计数据重复次数和重复数据 选中A、B两列,点击插入选项卡-数...

  • 数据清洗

  • 数据清洗

    数据清洗是指对提供的原始数据进行一定的加工,使得其方便后续的特征抽取。其与特征抽取的界限有时也没有那么明确。常用的...

网友评论

      本文标题:数据清洗

      本文链接:https://www.haomeiwen.com/subject/qwzocktx.html