본문 바로가기
추천 시스템 이론

PyTorch 추천 시스템(Recommender System) GitHub 코드 작동 순서

by 블쭌 2021. 4. 23.
728x90
  • main.py

- argparse.ArgumentParser()를 통한 모델명, 데이터 경로, 저장 경로 입력받기

parser = argparse.ArgumentParser()
parser.add_argument('--name', type=, default=)

conf = parse.parse_args()

- random_seed 미리 배정해놓기

# Seed NumPy first, then PyTorch, from the same configured value.
np.random.seed(conf.seed)
torch.manual_seed(conf.seed)

- cpu / gpu device 세팅

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

- 전처리된 데이터를 바탕으로 평가 데이터, 아이템 인기도, Evaluator 생성

# Evaluation data
eval_pos, eval_target = dataset.eval_data()

# Popular data (item popularity)
item_popularity = dataset.item_popularity

# Evaluator inputs: evaluation positives, evaluation targets, item popularity, top_k count
evaluator = Evaluator(eval_pos, eval_target, item_popularity, model_conf.top_k)

- model 불러오기

# Fetch the attribute of `models` named by conf.model (presumably the model class — verify).
model_base = getattr(models, conf.model)
# Call it with the model config, the dataset's user/item counts, and the device.
model = model_base(model_conf, dataset.num_users, dataset.num_items, device)

 


  • utils > params.py

- Params를 통해 각 모델의 하이퍼파라미터를 .json 파일에서 불러와 dictionary 형태로 update하기


  • utils > DataUtils.py 데이터 전처리 진행

- dataframe을 sparse matrix로 변환

def df_to_sparse(df, shape):
    """Convert an interaction DataFrame into two CSR matrices.

    Args:
        df: DataFrame exposing ``user``, ``item``, ``ratings`` and
            ``org_ratings`` columns.
        shape: ``(num_rows, num_cols)`` used for both matrices.

    Returns:
        ``(ratings_matrix, org_ratings_matrix)``. Rows without any stored
        entry are removed from the first matrix only.
    """
    coords = (df.user, df.item)
    ratings_matrix = sp.csr_matrix((df.ratings, coords), dtype='float64', shape=shape)
    org_matrix = sp.csr_matrix((df.org_ratings, coords), dtype='float64', shape=shape)

    # Consecutive indptr differences give the number of stored entries per row.
    stored_per_row = np.diff(ratings_matrix.indptr)
    num_empty = (stored_per_row == 0).sum()
    if num_empty > 0:
        print('%d empty users are dropped from matrix.' % num_empty)
        # NOTE(review): only the first matrix is filtered, so after a drop the
        # row indices of the two results no longer line up — confirm intended.
        ratings_matrix = ratings_matrix[stored_per_row != 0]

    return ratings_matrix, org_matrix

- train / test split

# Split every user's interactions into a train part and a test part.
# Relies on `data` (a DataFrame with `user` and `timestamps` columns) and
# `train_ratio` being defined by the surrounding code.
data_group = data.groupby('user')

train_list, test_list = [], []

# Counters for users whose train / test part came out empty.
num_zero_train = 0
num_zero_test = 0

for _, group in data_group:
    user = pd.unique(group.user)[0]
    num_items_user = len(group)
    num_train = int(train_ratio * num_items_user)
    num_test = num_items_user - num_train

    group = group.sort_values(by='timestamps')

    # Boolean mask over the user's rows: True -> train, False -> test.
    idx = np.ones(num_items_user, dtype='bool')

    # Test rows are drawn uniformly at random without replacement.
    # NOTE(review): despite the timestamp sort above, this is not a
    # chronological split — confirm whether a time-based split was intended.
    test_idx = np.random.choice(num_items_user, num_test, replace=False)
    idx[test_idx] = False

    train_part = group[idx]
    test_part = group[np.logical_not(idx)]

    if len(train_part) == 0:
        num_zero_train += 1
    else:
        train_list.append(train_part)

    if len(test_part) == 0:
        num_zero_test += 1
    else:
        test_list.append(test_part)

train_df = pd.concat(train_list)
test_df = pd.concat(test_list)
print('# zero train, test: %d, %d' % (num_zero_train, num_zero_test))

 

 

 

 


 

  • utils > Dataset.py  데이터 파이프라인 구축
# MovieLens, the standard recommender-system benchmark, is the supported data.
# Raise an error when main passes a data name that is not one of the two below.
if data_name not in ('ml-100k', 'ml-1m'):
    raise NotImplementedError('Choose correct dataset: {ml-100k, ml-1m}')

if data_name == 'ml-100k':
    sep, filename = '\t', 'u.data'
    self.num_users, self.num_items = 943, 1682
else:
    sep, filename = '::', 'ratings.dat'
    self.num_users, self.num_items = 6040, 3952

# Both paths live under <data_dir>/<data_name>/ and share the data name as stem.
data_path = os.path.join(data_dir, data_name, data_name + '.data')
stat_path = os.path.join(data_dir, data_name, data_name + '.stat')

- __str__함수를 통해서 데이터에 대한 정보 출력

    def __str__(self):
        """Return a multi-line text summary: user count, item count, split ratio."""
        parts = (
            '======== [Dataset] ========\n',
            'Number of Users : %d\n' % self.num_users,
            'Number of items : %d\n' % self.num_items,
            'Split ratio: %s\n' % str(self.train_ratio),
            '\n',
        )
        return ''.join(parts)

- sparse_to_dict: sparse한 matrix에서 indptr 접근을 이용해서 dictionary형태로 user:[item]을 만들어준다.

def sparse_to_dict(self, sparse_matrix):
    """Map each row index of a CSR matrix to the list of its stored column indices.

    Args:
        sparse_matrix: matrix exposing CSR-style ``indptr`` / ``indices``
            arrays and a ``shape``.

    Returns:
        dict ``{row: [col, ...]}`` with one entry per row; a row without
        stored entries maps to ``[]``.
    """
    indptr, indices = sparse_matrix.indptr, sparse_matrix.indices
    # indptr[u]:indptr[u + 1] is the slice of `indices` that belongs to row u.
    return {
        u: indices[indptr[u]:indptr[u + 1]].tolist()
        for u in range(sparse_matrix.shape[0])
    }

- positive sample과 negative sample 생성하는 함수

    def generate_pairwise_data_from_matrix(self, rating_matrix, num_negatives=1, p=None):
        """Build (user, positive item, negative item) training triples.

        For every stored entry in a user's row, ``num_negatives`` negative
        items are sampled with replacement.

        Args:
            rating_matrix: CSR matrix (users x items); stored entries are the
                positives.
            num_negatives: number of negatives drawn per positive.
            p: optional sampling distribution over all items, forwarded to
                ``np.random.choice``. When ``None``, negatives are drawn
                uniformly from the items the user has no entry for.
                NOTE(review): a caller-supplied ``p`` is used as-is, so it may
                yield items the user already interacted with — confirm.

        Returns:
            Tuple of three 1-D int64 tensors ``(users, positives, negatives)``
            of equal length, created on ``self.device`` (the string ``'gpu'``
            is treated as CUDA).
        """
        num_users, num_items = rating_matrix.shape
        indptr, indices = rating_matrix.indptr, rating_matrix.indices

        users = []
        positives = []
        negatives = []
        for user in range(num_users):
            # Positives are needed on every path, not only when p is None.
            pos_index = indices[indptr[user]:indptr[user + 1]]
            num_positives = len(pos_index)
            if num_positives == 0:
                print('[WARNING] user %d has 0 ratings. Not generating negative samples.' % user)
                continue

            if p is None:
                num_all_negatives = num_items - num_positives
                if num_all_negatives == 0:
                    # No candidate left to sample from; avoid dividing by zero.
                    print('[WARNING] user %d has rated every item. Not generating negative samples.' % user)
                    continue
                # Uniform over unseen items, zero probability for positives.
                prob = np.full(num_items, 1 / num_all_negatives)
                prob[pos_index] = 0.0
            else:
                prob = p

            neg_items = np.random.choice(num_items, num_positives * num_negatives, replace=True, p=prob)
            # Each positive is paired with the next `num_negatives` sampled items.
            users.extend([user] * len(neg_items))
            positives.extend(np.repeat(pos_index, num_negatives).tolist())
            negatives.extend(neg_items.tolist())

        # Accept the legacy 'cpu' / 'gpu' strings as well as torch.device objects,
        # so the method never falls through and returns None.
        device = self.device
        if isinstance(device, str) and device == 'gpu':
            device = 'cuda'
        return tuple(
            torch.tensor(values, dtype=torch.long, device=device)
            for values in (users, positives, negatives)
        )

- Trainer

def train(self):
    """Training loop (as excerpted): log the config, build the optimizer, run epochs.

    Each epoch delegates to ``self.model.train_one_epoch`` and measures the
    wall-clock time it took.
    """
    # Log the configuration.
    self.logger.info(self.conf)

    # Only build an optimizer when the model has parameters to train.
    if len(list(self.model.parameters())) > 0:
        # Adam optimizer with the configured learning rate.
        optimizer = torch.optim.Adam(self.model.parameters(), self.lr)
    else:
        optimizer = None

    score_table = Table(table_name='Scores')

    for epoch in range(1, self.num_epochs + 1):
        # Train for one epoch.
        epoch_start = time.time()

        # Loss reported by the model after one epoch;
        # every model implements its own train_one_epoch.
        loss = self.model.train_one_epoch(self.dataset, optimizer, self.batch_size, False)
        train_elapsed = time.time() - epoch_start

	
728x90

댓글