master
/ generate_data.ipynb

generate_data.ipynb @master · view markup · raw · history · blame

Notebook
In [1]:
import pandas as pd

# Directories holding the two classes of data; the trailing slash matters
# because file names are appended by plain string concatenation.
yes_data_path, no_data_path = './mydata/Yes/', './mydata/No/'

# Load one root table per class.
yes_root_file = ''.join([yes_data_path, '长乐区.csv'])
no_root_file = ''.join([no_data_path, '闽清县.csv'])
yes_root_csv = pd.read_csv(yes_root_file)
no_root_csv = pd.read_csv(no_root_file)

# Preview the first rows of the "No" root table as the cell output.
no_root_csv.head()
Out[1]:
指 标 2017 2016 2015 2014 2013 2012 2011 2010 2009 2008
0 行政区域面积 1467 1467 1467 1467 1467 1467 1467 1467 1467 1467
1 第一产业增加值 282590 276808 254933 230741 203721 196012 177808 157023 130424 123354
2 第二产业增加值 756832 785238 779507 741551 673699 607700 537500 479300 411956 417242
3 居民储蓄存款余额 993018 870944 840324 738858 645667 571882 526651 445659 383279 317078
4 年末金融机构各项贷款余额 767371 644153 582445 502392 427285 345973 281859 237260 199415 143451
In [3]:
import random

# Number of CSV files to generate per generate_csv() call (passed as its gen_num argument).
gen_num = 1000

def generate_csv(input_csv, gen_num, data_path, rng=None):
    """Write ``gen_num`` randomly perturbed copies of ``input_csv`` as CSV files.

    For every generated file, each row of the table gets exactly one cell
    perturbed: a column is picked at random from every column except the
    first (which is treated as a label column and never touched), and that
    cell is scaled by a random factor between 0.9 and 1.1 in steps of 0.001.

    Every file is derived from an untouched copy of ``input_csv``, so the
    caller's DataFrame is never modified and noise does not accumulate from
    one generated file to the next.

    Parameters
    ----------
    input_csv : pandas.DataFrame
        Root table. Column 0 holds labels; all remaining columns must be
        convertible to float.
    gen_num : int
        Number of files to write. Files are named ``0.csv`` ... ``gen_num-1.csv``.
    data_path : str
        Prefix the file name is appended to by plain concatenation, so a
        directory must end with a path separator (e.g. ``'./mydata/Yes/'``).
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator; pass ``random.Random(seed)`` for reproducible output.

    Raises
    ------
    ValueError
        If a perturbation is attempted on a table with fewer than two
        columns (there is no value column to choose from), or if a value
        column cannot be converted to float.
    """
    if rng is None:
        rng = random

    num_row, num_col = input_csv.shape

    # Convert the value columns to float once, up front. Scaling produces
    # non-integer values, and writing a float into an integer column via
    # .iloc relies on implicit upcasting that pandas has deprecated.
    # astype() also returns a new frame, so the caller's data stays intact.
    value_columns = input_csv.columns[1:]
    root = input_csv.astype({name: float for name in value_columns})

    for i in range(gen_num):
        # Start from the pristine root every time so each file is an
        # independent perturbation rather than one step of a random walk.
        sample = root.copy()
        for row in range(num_row):
            col = rng.randint(1, num_col - 1)
            factor = 1.0 + rng.randint(-100, 100) / 1000.
            sample.iloc[row, col] = sample.iloc[row, col] * factor
        sample.to_csv(data_path + str(i) + '.csv', index=False)

# Generate the perturbed files for the "Yes" class, then for the "No" class;
# each set is written next to its own root CSV.
generate_csv(input_csv=yes_root_csv, gen_num=gen_num, data_path=yes_data_path)
generate_csv(input_csv=no_root_csv, gen_num=gen_num, data_path=no_data_path)