当先锋百科网

首页 1 2 3 4 5 6 7

现有一个csv文件,包含'CNUM'和'COMPANY'两列,数据里包含空行,且有内容重复的行数据。

要求:

1)去掉空行;

2)重复行数据只保留一行有效数据;

3)修改'COMPANY'列的名称为'Company_New‘;

4)并在其后增加六列,分别为'C_col',‘D_col',‘E_col',‘F_col',‘G_col',‘H_col'。

一,使用pythonPandas来处理:importpandasaspd

importnumpyasnp

frompandasimportDataFrame,Series

defdeal_with_data(filepath,newpath):

file_obj=open(filepath)

df=pd.read_csv(file_obj)#读取csv文件,创建DataFrame

df=df.reindex(columns=['CNUM','COMPANY','C_col','D_col','E_col','F_col','G_col','H_col'],fill_value=None)#重新指定列索引

df.rename(columns={'COMPANY':'Company_New'},inplace=True)#修改列名

df=df.dropna(axis=0,how='all')#去除NAN即文件中的空行

df['CNUM']=df['CNUM'].astype('int32')#将CNUM列的数据类型指定为int32

df=df.drop_duplicates(subset=['CNUM','Company_New'],keep='first')#去除重复行

df.to_csv(newpath,index=False,encoding='GBK')

file_obj.close()

if__name__=='__main__':

file_path=r'C:\Users\12078\Desktop\python\CNUM_COMPANY.csv'

file_save_path=r'C:\Users\12078\Desktop\python\CNUM_COMPANY_OUTPUT.csv'

deal_with_data(file_path,file_save_path)

二,使用VBA来处理:OptionBase1

OptionExplicit

Submain()

OnErrorGoToerror_handling

DimwbAsWorkbook

Dimwb_outAsWorkbook

DimshtAsWorksheet

Dimsht_outAsWorksheet

DimrngAsRange

DimusedrowsAsByte

Dimusedrows_outAsByte

Dimdict_cnum_companyAsObject

Dimstr_file_pathAsString

Dimstr_new_file_pathAsString

'assignvaluestovariables:

str_file_path="C:\Users\12078\Desktop\Python\CNUM_COMPANY.csv"

str_new_file_path="C:\Users\12078\Desktop\Python\CNUM_COMPANY_OUTPUT.csv"

Setwb=checkAndAttachWorkbook(str_file_path)

Setsht=wb.Worksheets("CNUM_COMPANY")

Setwb_out=Workbooks.Add

wb_out.SaveAsstr_new_file_path,xlCSV'createacsvfile

Setsht_out=wb_out.Worksheets("CNUM_COMPANY_OUTPUT")

Setdict_cnum_company=CreateObject("Scripting.Dictionary")

usedrows=WorksheetFunction.Max(getLastValidRow(sht,"A"),getLastValidRow(sht,"B"))

'renametheheader'COMPANY'to'Company_New',removeblank&duplicatelines/rows.

Dimcnum_companyAsString

cnum_company=""

ForEachrngInsht.Range("A1","A"&usedrows)

IfVBA.Trim(rng.Offset(0,1).Value)="COMPANY"Then

rng.Offset(0,1).Value="Company_New"

EndIf

cnum_company=rng.Value&"-"&rng.Offset(0,1).Value

IfVBA.Trim(cnum_company)<>"-"AndNotdict_cnum_company.Exists(rng.Value&"-"&rng.Offset(0,1).Value)Then

dict_cnum_company.Addrng.Value&"-"&rng.Offset(0,1).Value,""

EndIf

Nextrng

'loopthekeysofdictsplitthekeyesby'-'intocnumarrayandcompanyarray.

Dimindex_dictAsByte

Dimarr_cnum()

Dimarr_Company()

Forindex_dict=0ToUBound(dict_cnum_company.keys)

ReDimPreservearr_cnum(1ToUBound(dict_cnum_company.keys)+1)

ReDimPreservearr_Company(1ToUBound(dict_cnum_company.keys)+1)

arr_cnum(index_dict+1)=Split(dict_cnum_company.keys()(index_dict),"-")(0)

arr_Company(index_dict+1)=Split(dict_cnum_company.keys()(index_dict),"-")(1)

Debug.Printindex_dict

Next

'assignsthevalueofthearraystothecelles.

sht_out.Range("A1","A"&UBound(arr_cnum))=Application.WorksheetFunction.Transpose(arr_cnum)

sht_out.Range("B1","B"&UBound(arr_Company))=Application.WorksheetFunction.Transpose(arr_Company)

'add6columnstooutputcsvfile:

Dimarr_columns()AsVariant

arr_columns=Array("C_col","D_col","E_col","F_col","G_col","H_col")'

sht_out.Range("C1:H1")=arr_columns

CallcheckAndCloseWorkbook(str_file_path,False)

CallcheckAndCloseWorkbook(str_new_file_path,True)

ExitSub

error_handling:

CallcheckAndCloseWorkbook(str_file_path,False)

CallcheckAndCloseWorkbook(str_new_file_path,False)

EndSub

'辅助函数:

'GetlastrowofColumnNinaWorksheet

FunctiongetLastValidRow(in_wsAsWorksheet,in_colAsString)

getLastValidRow=in_ws.Cells(in_ws.Rows.count,in_col).End(xlUp).Row

EndFunction

FunctioncheckAndAttachWorkbook(in_wb_pathAsString)AsWorkbook

DimwbAsWorkbook

DimmywbAsString

mywb=in_wb_path

ForEachwbInWorkbooks

IfLCase(wb.FullName)=LCase(mywb)Then

SetcheckAndAttachWorkbook=wb

ExitFunction

EndIf

Next

Setwb=Workbooks.Open(in_wb_path,UpdateLinks:=0)

SetcheckAndAttachWorkbook=wb

EndFunction

FunctioncheckAndCloseWorkbook(in_wb_pathAsString,in_savedAsBoolean)

DimwbAsWorkbook

DimmywbAsString

mywb=in_wb_path

ForEachwbInWorkbooks

IfLCase(wb.FullName)=LCase(mywb)Then

wb.Closesavechanges:=in_saved

ExitFunction

EndIf

Next

EndFunction

三,输出结果:

两种方法输出结果相同:

四,比较总结:

Pythonpandas内置了大量处理数据的方法,我们不需要重复造轮子,用起来很方便,代码简洁的多。

ExcelVBA处理这个需求,使用了数组,字典等数据结构(实际需求中,数据量往往很大,所以一些地方没有直接使用遍历单元格的方法),以及处理字符串,数组和字典的很多方法,对文件的操作也很复杂,一旦出错,调试起来比python也较困难,代码已经尽量优化,但还是远比Python要多。