技术标签: 图神经网络 python 机器学习 人工智能 开发语言
在网上找数据集的时候无意中看到了有这么个比赛,伴随着好奇的心理以及尝试的心态就参加试一试。这次比赛出现了一些让我意想不到的结果,特此记录一下。
基于中国联通的大数据能力,通过使用对联通的信令数据、通话数据、互联网行为等数据进行建模,对个人是否会返乡工作进行判断。以上为官网的背景介绍。以论文的角度看应该是要参考一些人口流动预测的场景。
数据集介绍:
这个赛题已经有大佬在网上开源了baseline方案。大佬的解决方案的大致思路是:
# Baseline feature engineering (from the open-sourced baseline):
# f47 is a composite code of f1/f2, then pairwise arithmetic cross-features
# are generated over the location-like columns.
train_data['f47'] = train_data['f1'] * 10 + train_data['f2']
test_data['f47'] = test_data['f1'] * 10 + test_data['f2']

loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
for df in (train_data, test_data):
    for i, left in enumerate(loc_f):
        for right in loc_f[i + 1:]:
            df[f'{left}+{right}'] = df[left] + df[right]
            df[f'{left}-{right}'] = df[left] - df[right]
            df[f'{left}*{right}'] = df[left] * df[right]
            # +1 in the denominator guards against division by zero
            df[f'{left}/{right}'] = df[left] / (df[right] + 1)
# Same pairwise cross-feature construction over the communication columns.
com_f = ['f43', 'f44', 'f45', 'f46']
for df in (train_data, test_data):
    for i, left in enumerate(com_f):
        for right in com_f[i + 1:]:
            df[f'{left}+{right}'] = df[left] + df[right]
            df[f'{left}-{right}'] = df[left] - df[right]
            df[f'{left}*{right}'] = df[left] * df[right]
            # +1 in the denominator guards against division by zero
            df[f'{left}/{right}'] = df[left] / (df[right] + 1)
# Label-encode the categorical column f3 with an encoder fitted on the union
# of train and test, so categories seen only in test cannot crash transform().
cat_columns = ['f3']
data = pd.concat([train_data, test_data])
for col in cat_columns:
    lb = LabelEncoder()
    lb.fit(data[col])
    train_data[col] = lb.transform(train_data[col])
    test_data[col] = lb.transform(test_data[col])

# Feature / target column bookkeeping.
num_columns = [col for col in train_data.columns if col not in ['id', 'label', 'f3']]
feature_columns = num_columns + cat_columns
target = 'label'
train = train_data[feature_columns]
label = train_data[target]
test = test_data[feature_columns]
print(train.shape)
print(train)
features = [i for i in train.columns if i not in ['label', 'id']]
# BUGFIX: `train` holds only feature_columns, which exclude 'label', so the
# original `y = train['label']` raised KeyError. The target series was
# already extracted above as `label`.
y = label
# 5-fold stratified CV; fixed seeds everywhere keep folds and model runs
# reproducible. feat_imp_df accumulates fold-averaged feature importances.
KF = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})

# LightGBM hyper-parameters for the binary AUC task.
params = {
    # task / objective
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    # tree capacity
    'num_leaves': 2 ** 6,
    'max_depth': 8,
    'max_bin': 255,
    # row / column sampling
    'colsample_bytree': 0.8,
    'subsample_freq': 1,
    'subsample': 0.8,
    # training schedule
    'learning_rate': 0.05,
    'num_boost_round': 5000,
    'early_stopping_rounds': 100,
    # reproducibility / misc
    'n_jobs': 30,
    'tree_learner': 'serial',
    'verbose': -1,
    'seed': 2021,
    'bagging_seed': 2021,
    'feature_fraction_seed': 2021,
}
# Out-of-fold predictions for the train set and fold-averaged predictions
# for the test set.
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros((len(test)))

# Model training, one LightGBM model per stratified fold.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, y.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=y.iloc[val_idx])
    num_round = 3000
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=100,
        early_stopping_rounds=50,
    )
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    # Each fold contributes 1/5 of the final test prediction.
    predictions_lgb[:] += clf.predict(test[features], num_iteration=clf.best_iteration) / 5
    feat_imp_df['imp'] += clf.feature_importance() / 5

# Threshold the OOF probabilities at 0.5 once for all classification metrics.
oof_binary = [1 if i >= 0.5 else 0 for i in oof_lgb]
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(f1_score(y, oof_binary)))
print("Precision score: {}".format(precision_score(y, oof_binary)))
print("Recall score: {}".format(recall_score(y, oof_binary)))
这里就不放完整的大佬代码了,有需要的可以去赛题的评论区里面自行获取。
以上是大佬们开源的解决方案,就我个人感觉方案存在一些小问题(虽然我的排名非常垃圾可是我依旧想这样说一下我的看法):
以上是我对场景的理解。也就是说我认为解决预测问题的需要一种可以自适应的挖掘出特征之间对目标的重要程度,并且可以使用未标注数据集,而且还可以自动挖掘出样本之间的相关性的模型。
引入库文件
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset,DataLoader
import torch_geometric
import pandas as pd
import networkx as ntx
import torch
import numpy as np
import torch.nn as nn
import math
import torch.nn.functional as F
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics import Accuracy
from torchmetrics import AUROC
#from torchmetrics.classification import BinaryAccuracy
import torch
import numpy as np
import torch.nn as nn
import math
import torch.nn.functional as F
引入数据集
# The batch size directly determines the size of the per-batch adjacency
# matrix (the graph is built over one batch's samples), so it must not be
# too small.
batchsize = 1024
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class MyDataset(torch.utils.data.Dataset):
    """Tabular dataset for the return-home prediction competition.

    Reads the competition CSVs under `root`, one-hot encodes the
    categorical column 'f3', and min-max normalizes every other feature
    using statistics computed from the *training* file so that train and
    test share one scale.

    Args:
        root: directory containing dataTrain.csv / dataNoLabel.csv / dataA.csv.
        datatype: 'train' loads dataTrain.csv, anything else loads dataA.csv.
        num: unused (kept for backward compatibility with existing callers).
    """

    def __init__(self, root, datatype, num=49858):
        name1 = os.path.join(root, 'dataTrain.csv')
        name2 = os.path.join(root, 'dataNoLabel.csv')
        name3 = os.path.join(root, 'dataA.csv')
        name = name1 if datatype == 'train' else name3
        df = pd.read_csv(name)
        df1 = pd.read_csv(name1)
        df2 = pd.read_csv(name2)
        df3 = pd.read_csv(name3)
        # BUGFIX: DataFrame.insert mutates in place and returns None, so the
        # original `self.data2 = df2.insert(...)` stored None. Insert first,
        # then keep the frame. Labels 2/3 tag unlabeled / test-set rows.
        df2.insert(loc=df2.shape[1], column='label', value=2)
        self.data2 = df2
        df3.insert(loc=df3.shape[1], column='label', value=3)
        self.data3 = df3
        data = pd.DataFrame(df, columns=['f3'])
        dummies = pd.get_dummies(data)
        # Min-max normalize each numeric feature with the train-set min/max.
        # BUGFIX: DataFrame.iteritems() was removed in pandas 2.0; items() is
        # the long-standing equivalent.
        for index, row in df.items():
            if index not in ['id', 'label', 'f3']:
                a = np.min(np.array(df1[[index]]))
                b = np.max(np.array(df1[[index]]))
                df[[index]] = df[[index]].apply(lambda x: (x - a) / (b - a))
        # Splice the one-hot columns in at position 3, then drop the raw
        # categorical column f3 they replace.
        for index, row in dummies.items():
            df.insert(loc=3, column=index, value=row.tolist())
        df = df.drop(columns='f3', inplace=False)
        print(df.shape)
        self.data = df
        self.datatype = datatype

    def getdata(self, index, df):
        """Return (features, labels) tensors for row `index` of `df`.

        Columns 1..48 are the feature vector X (column 0 is the id);
        columns 49.. are the label part Y.
        """
        a = df.iloc[index, 1:49].values.tolist()
        b = df.iloc[index, 49:].values.tolist()
        a = [float(i) for i in a]
        b = [float(i) for i in b]
        X = torch.tensor(a, dtype=torch.float32)
        Y = torch.tensor(b, dtype=torch.float32)
        return X, Y

    def __getitem__(self, index):
        # Returns [features, labels] for DataLoader collation.
        samples, labels = self.getdata(index, self.data)
        sample = [samples, labels]
        return sample

    def __len__(self):
        return self.data.shape[0]
# Build the datasets: datatype='train' reads the labeled file,
# datatype='test' reads the competition inference file (dataA.csv).
traindata=MyDataset(root='./data/person/',datatype='train')
print(len(traindata))
testdata=MyDataset(root='./data/person/',datatype='test')
print(len(testdata))
# 90/10 random split of the labeled data into train/validation subsets.
train_size = int(len(traindata) * 0.9)
test_size = len(traindata) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(traindata, [train_size, test_size])
# NOTE(review): shuffle=False on the training loader is unusual; since the
# k-NN graph is rebuilt per batch, shuffling would change each sample's
# neighbourhood every epoch — confirm whether this is intentional.
train_loader = DataLoader(train_dataset, batch_size=batchsize, shuffle=False)
print(len(train_dataset))
print(len(test_dataset))
print(len(testdata))
# Debug helpers: inspect batch shapes coming out of the loaders.
#for step, (input,label) in enumerate(train_loader):
#print(input.shape)
#print(label.shape)
print('+++++++++++++++++++++test++++++++++++++++++++')
test_loader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False)
#for step, (input,label) in enumerate(test_loader):
#print(input.shape)
#print(label.shape)
多头注意力机制
class selfAttention(nn.Module):
    """Multi-head self-attention over a (batch, seq, input_size) tensor.

    Projects the input to keys/queries/values of width `hidden_size`,
    splits them into `num_attention_heads` heads, applies scaled
    dot-product attention per head, and returns the concatenated per-head
    context of shape (batch, seq, hidden_size).

    Raises:
        ValueError: if `hidden_size` is not divisible by `num_attention_heads`.
    """

    def __init__(self, num_attention_heads, input_size, hidden_size):
        super(selfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            # BUGFIX: the original concatenated string literals produced
            # "...heads%d" with no separator; a space was added so the
            # message formats correctly.
            raise ValueError(
                "the hidden size %d is not a multiple of the number of "
                "attention heads %d" % (hidden_size, num_attention_heads)
            )
        self.num_attention_heads = num_attention_heads
        # Integer division: divisibility was checked above.
        self.attention_head_size = hidden_size // num_attention_heads
        self.all_head_size = hidden_size
        self.key_layer = nn.Linear(input_size, hidden_size)
        self.query_layer = nn.Linear(input_size, hidden_size)
        self.value_layer = nn.Linear(input_size, hidden_size)

    def trans_to_multiple_heads(self, x):
        """Reshape (batch, seq, hidden) -> (batch, heads, seq, head_size)."""
        new_size = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, x):
        # Project to K/Q/V, then split into heads.
        key = self.key_layer(x)
        query = self.query_layer(x)
        value = self.value_layer(x)
        key_heads = self.trans_to_multiple_heads(key)
        query_heads = self.trans_to_multiple_heads(query)
        value_heads = self.trans_to_multiple_heads(value)
        # Scaled dot-product attention per head.
        attention_scores = torch.matmul(query_heads, key_heads.permute(0, 1, 3, 2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_probs = F.softmax(attention_scores, dim=-1)
        context = torch.matmul(attention_probs, value_heads)
        # Merge the heads back: (batch, seq, heads * head_size).
        context = context.permute(0, 2, 1, 3).contiguous()
        new_size = context.size()[:-2] + (self.all_head_size,)
        context = context.view(*new_size)
        return context
图学习层
class attgraNet(nn.Module):
    """Self-attention + GAT classifier over tabular samples.

    Each incoming batch is treated as a graph: every sample is a node.
    Features are embedded by fc1 and a multi-head self-attention over the
    `inputsize` feature tokens, a k-NN edge list is built from pairwise
    cosine similarity of the attended features, and four GATConv layers
    plus an MLP head produce log-probabilities over 2 classes (matching
    the NLLLoss used by the training loop).
    """

    def __init__(self, inputsize, batchsize, k):
        super(attgraNet, self).__init__()
        self.k = k                      # neighbours kept per node in the batch graph
        self.batchsize = batchsize
        self.inputsize = inputsize
        # Embed each of the `inputsize` scalar features into 10 dimensions.
        self.fc1 = nn.Linear(inputsize, inputsize * 10)
        self.att = selfAttention(4, 10, 48)
        # Four GAT layers; with concat heads the effective widths are
        # 16*16=256 -> 8*8=64 -> 8*8=64 -> 16*16=256.
        self.gat1 = torch_geometric.nn.GATConv(48 * 48, 16, 16, dropout=0.6)
        self.act1 = nn.LeakyReLU(0.1)
        self.gat2 = torch_geometric.nn.GATConv(256, 8, 8, dropout=0.6)
        self.act2 = nn.LeakyReLU(0.1)
        self.gat3 = torch_geometric.nn.GATConv(64, 8, 8, dropout=0.6)
        self.act3 = nn.LeakyReLU(0.1)
        self.gat4 = torch_geometric.nn.GATConv(64, 16, 16, dropout=0.6)
        self.act4 = nn.LeakyReLU(0.1)
        self.fc2 = nn.Sequential(nn.Linear(16 * 16, 84), nn.LeakyReLU(0.1), nn.BatchNorm1d(84))
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        # Per-sample feature embedding followed by self-attention across the
        # `inputsize` feature tokens (each token is a 10-dim embedding).
        x = self.fc1(x)
        x = x.unsqueeze(-1)
        x = x.reshape(-1, self.inputsize, 10)
        x = self.att(x)
        x = x.reshape(x.shape[0], -1, 1)
        a = x
        dim0, dim1, dim2 = a.shape
        # Build a directed k-NN edge list (edge_index, shape [2, dim0 * k])
        # from pairwise |cosine similarity| between the batch's samples.
        # BUGFIX: the original tensor had only one column per node and the
        # inner loop overwrote that column k times, so each node ended up
        # connected solely to its k-th most similar neighbour. We now keep
        # one edge per (node, neighbour) pair.
        # NOTE(review): this is an O(dim0^2) Python loop — slow for large
        # batches; a debug print of x.device was also removed here.
        para = torch.ones([2, dim0 * self.k], dtype=torch.long).to(device)
        for i in range(dim0):
            score = torch.zeros(dim0)
            for j in range(dim0):
                if i != j:
                    score[j] = torch.abs(torch.cosine_similarity(a[i], a[j], dim=0))
            for j in range(self.k):
                idx = torch.argmax(score, dim=0)
                para.data[0][i * self.k + j] = i
                para.data[1][i * self.k + j] = idx
                score[idx] = 0  # knock out the current best to find the next one
        # Assemble the graph (node features + COO edge index).
        x = x.reshape(dim0, -1)
        data = torch_geometric.data.Data(x=x, edge_index=para.long()).to(device)
        # Four GAT layers over the batch graph.
        x = self.act1(self.gat1(data.x, data.edge_index))
        x = self.act2(self.gat2(x, data.edge_index))
        x = self.act3(self.gat3(x, data.edge_index))
        x = self.act4(self.gat4(x, data.edge_index))
        # MLP head -> log-probabilities for NLLLoss.
        x = self.fc2(x)
        x = F.dropout(x, training=self.training)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)
训练
# Instantiate the model: 48 input features, k=5 neighbours per node.
net = attgraNet(48,batchsize,5)
print(net)
net.to(device)
print(next(net.parameters()).device)
# NLLLoss pairs with the log_softmax output of attgraNet.forward.
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters())
print(torch.cuda.is_available())
def train(epoch, model):
    """Run one training epoch over train_loader; return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for i, (X, y) in enumerate(train_loader):
        batch = X.to(device)
        target = y.to(device)
        output = model.forward(batch)
        # NLLLoss expects class indices, hence the squeeze + long cast.
        loss = criterion(output, target.squeeze(dim=1).long())
        print("[{}, {}] loss %{}':".format(epoch, i, loss))
        running_loss += loss.item()
        # Standard optimizer step: reset grads, backprop, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_loss_train = running_loss / (len(train_dataset) / batchsize)
    print(epoch_loss_train)
    return epoch_loss_train
测试
def val(model):
    """Evaluate on the held-out split; return the mean minibatch loss.

    Also prints a per-batch and mean accuracy (torchmetrics Accuracy).
    AUC is the competition metric, but torchmetrics' AUROC ran out of GPU
    memory here, so accuracy is reported instead (the commented-out AUROC
    code is kept for reference).
    """
    model.eval()
    running_loss = 0.0
    n = 0
    result = 0
    resauroc = 0
    metric = Accuracy(top_k=1)
    #auroc = AUROC(num_classes=2)
    with torch.no_grad():
        for i, (data, y) in enumerate(test_loader):
            input = data.to(device)
            y = y.to(device)
            # BUGFIX(minor): removed the original optimizer.zero_grad() call
            # here — it is a no-op inside torch.no_grad() evaluation and no
            # backward pass follows.
            output = model.forward(input)
            loss = criterion(output, y.squeeze(dim=1).long())
            print("[{}] loss %{}':".format(i, loss))
            n = n + 1
            res = metric(output.cpu(), y.squeeze(dim=1).long().cpu())
            #res1=AUROC(output.cpu(),y.squeeze(dim=1).long().cpu())
            print("[{}] ACC %{}':".format(i, res))
            #print("[{}] AUROC %{}':".format(i,res1))
            result = res + result
            #resauroc=res1+resauroc
            running_loss += loss.item()
    epoch_loss_val = running_loss / (len(test_dataset) / batchsize)
    print(epoch_loss_val)
    print(result / n)
    #print(resauroc/n)
    return epoch_loss_val


val(net)
其余代码
def main(model):
    """Full training driver: `epochs` rounds of train/val, checkpointing
    the best (lowest validation loss) weights and the final weights."""
    # BUGFIX: `time` is never imported at module level in this file, so
    # `time.time()` raised NameError; import it locally.
    import time

    min_loss = float('inf')
    loss_train = []
    loss_val = []
    epochs = 200
    since = time.time()
    for epoch in range(epochs):
        epoch_loss_train = train(epoch, model)
        loss_train.append(epoch_loss_train)
        epoch_loss_val = val(model)
        loss_val.append(epoch_loss_val)
        if epoch_loss_val < min_loss:
            min_loss = epoch_loss_val
            # Keep the best-so-far weights on disk.
            torch.save(model.state_dict(), 'bestsaveBIG.pt')
        # Elapsed wall-clock time since training started (kept from the
        # original; currently only computed, not reported).
        time_elapsed = time.time() - since
    torch.save(model.state_dict(), 'lastsaveBIG.pt')


if __name__ == "__main__":
    main(net)
    print('train finish')
Tips
文章浏览阅读3.5k次,点赞2次,收藏13次。为了从FTP服务器下载文件,需要要实现一个简单的FTP客户端。FTP(文件传输协议) 是 TCP/IP 协议组中的应用层协议。FTP协议使用字符串格式命令字,每条命令都是一行字符串,以“\r\n”结尾。客户端发送格式是:命令+空格+参数+"\r\n"的格式服务器返回格式是以:状态码+空格+提示字符串+"\r\n"的格式,代码只要解析状态码就可以了。读写文件需要登陆服务器,特殊用..._ftp 登录返回230
文章浏览阅读648次。前提:systemctl stop firewalld 关闭防火墙关闭selinux查看getenforce临时关闭setenforce 0永久关闭sed-i'/SELINUX/s/enforcing/disabled/'/etc/selinux/configselinux的三种模式enforcing:强制模式,SELinux 运作中,且已经正确的开始限制..._centos7 安装rabbitmq3.6.5
文章浏览阅读5.8k次。满意答案s55f2avsx2017.09.05采纳率:46%等级:12已帮助:5646人新版Android Studio/IntelliJ IDEA可以直接导入eclipse项目,不再推荐使用eclipse导出gradle的方式2启动Android Studio/IntelliJ IDEA,选择 import project3选择eclipse 项目4选择 create project f..._android studio 项目导入idea 看不懂安卓项目
文章浏览阅读860次,点赞2次,收藏6次。AI大模型技术已经在自然语言处理、计算机视觉、多模态交互等领域取得了显著的进展和成果,同时也引发了一系列新的挑战和问题,如数据质量、计算效率、知识可解释性、安全可靠性等。城市运维涉及到多个方面,如交通管理、环境监测、公共安全、社会治理等,它们需要处理和分析大量的多模态数据,如图像、视频、语音、文本等,并根据不同的场景和需求,提供合适的决策和响应。知识搜索有多种形式,如语义搜索、对话搜索、图像搜索、视频搜索等,它们可以根据用户的输入和意图,从海量的数据源中检索出最相关的信息,并以友好的方式呈现给用户。_ai大模型应用开发
文章浏览阅读8.2k次,点赞12次,收藏121次。为什么要测量阻抗呢?阻抗能代表什么?阻抗测量的注意事项... ...很多人可能会带着一系列的问题来阅读本文。不管是数字电路工程师还是射频工程师,都在关注各类器件的阻抗,本文非常值得一读。全文13000多字,认真读完大概需要2小时。一、阻抗测试基本概念阻抗定义:阻抗是元器件或电路对周期的交流信号的总的反作用。AC 交流测试信号 (幅度和频率)。包括实部和虚部。图1 阻抗的定义阻抗是评测电路、元件以及制作元件材料的重要参数。那么什么是阻抗呢?让我们先来看一下阻抗的定义。首先阻抗是一个矢量。通常,阻抗是_阻抗实部和虚部
文章浏览阅读955次。前面章节分享试用了pyzero,pygame但随着想增加更丰富的游戏内容,好多还要进行自己编写类,从今天开始解绍一个新的python游戏库arcade模块。通过此次的《连连看》游戏实现,让我对swing的相关知识有了进一步的了解,对java这门语言也有了比以前更深刻的认识。java的一些基本语法,比如数据类型、运算符、程序流程控制和数组等,理解更加透彻。java最核心的核心就是面向对象思想,对于这一个概念,终于悟到了一些。_arcade语言 like
文章浏览阅读1.1k次。源码简介与安装说明:2021增强版短视频去水印源码 去水印微信小程序源码网站 去水印软件源码安装环境(需要材料):备案域名–服务器安装宝塔-安装 Nginx 或者 Apachephp5.6 以上-安装 sg11 插件小程序已自带解析接口,支持全网主流短视频平台,搭建好了就能用注:接口是公益的,那么多人用解析慢是肯定的,前段和后端源码已经打包,上传服务器之后在配置文件修改数据库密码。然后输入自己的域名,进入后台,创建小程序,输入自己的小程序配置即可安装说明:上传源码,修改data/_去水印机要增强版
文章浏览阅读557次。1. 触发器是FPGA存储数据的基本单元2. 触发器作为时序逻辑的基本元件,官方提供了丰富的配置方式,以适应各种可能的应用场景。_fdre #(.init(1'b0) // initial value of register (1'b0 or 1'b1) ) fdce_osc (
文章浏览阅读560次。本该是不同编译器结果不同,但是尝试了g++ msvc都是先计算c,再计算b,最后得到a+b+c是经过赋值以后的b和c参与计算而不是6。由上表可知,将q复制到p数组可以表示为:*p++=*q++,*优先级高,先取到对应q数组的值,然后两个++都是在后面,该行运算完后执行++。在电脑端编译完后会分为text data bss三种,其中text为可执行程序,data为初始化过的ro+rw变量,bss为未初始化或初始化为0变量。_嵌入式面试笔试c语言知识点
文章浏览阅读2.3k次。57 Things I've Learned Founding 3 Tech CompaniesJason Goldberg, Betashop | Oct. 29, 2010, 1:29 PMI’ve been founding andhelping run techn_mature
文章浏览阅读1.9k次。问题:先讲下需求,有若干个文本文件(txt或者csv文件等),每行代表一条数据,现在希望能合并成 1 个文本文件,且需要去除重复行。分析:一向奉行简单原则,如无必要,绝不复杂。如果数据量不大,那么如下两条命令就可以搞定合并:cat a.txt >> new.txtcat b.txt >> new.txt……去重:cat new...._python 超大文本合并
文章浏览阅读489次。这个过渡页是第一次打开小程序展示的,点击某个小程序前把手机的开发者->network link conditioner->enable & very bad network 就会在停在此页。比如《支付宝运动》这个小程序先看这个类的.h可以看到它继承于DTViewController点击左上角返回的方法- (void)back;#import "DTViewController.h"#import "APBaseLoadingV..._类似支付宝页面过度加载页