好友
阅读权限10
听众
最后登录1970-1-1
|
import pandas as pd
# Load the tab-separated expression table.
# NOTE: replace the path below with the actual location of the input file.
data = pd.read_csv(
    "C:/Users/Administrator/Desktop/proteinfpkmshaixuan.txt",
    sep="\t",
)

# Keep the gene IDs as a standalone Series, then use them as the row index so
# that every remaining column is an expression column.
gene_ids = data['id']
data = data.set_index('id')
# 1. Not Expressed: genes whose expression level is below 1 in every tissue.
below_threshold_everywhere = (data < 1).all(axis=1)
not_expressed = data[below_threshold_everywhere]
# 2. Tissue Enriched: expression >= 1 in one tissue, and at least 5 times the
#    highest expression seen in any other tissue.
#    One labelled frame is collected per tissue; the list is then replaced by
#    the concatenation of those frames.
tissue_enriched = []
for tissue in data.columns:
    # Per-gene maximum over every column except the current one.
    max_elsewhere = data.drop(columns=[tissue]).max(axis=1)
    is_enriched = (data[tissue] >= 1) & (data[tissue] >= 5 * max_elsewhere)
    # Take an explicit copy before adding the label column: assigning into a
    # boolean-filtered slice of `data` triggers pandas' chained-assignment
    # warning.
    enriched = data[is_enriched].copy()
    enriched['tissue'] = tissue
    tissue_enriched.append(enriched)
# The list is empty only when `data` has no columns; fall back to an empty frame.
tissue_enriched = pd.concat(tissue_enriched) if tissue_enriched else pd.DataFrame()
# 3. Group Enriched: every tissue in a group of 2-7 tissues has expression >= 1,
#    and the lowest value inside the group is at least 5 times the highest
#    value among all tissues outside the group.
#    NOTE: every 2- to 7-column combination is evaluated, so the number of
#    iterations grows quickly with the number of columns in `data`.
from itertools import combinations

group_enriched = []
for tissue_count in range(2, 8):
    for tissues in combinations(data.columns, tissue_count):
        other_tissues = data.columns.difference(tissues)
        # Computed once per group (the minimum is needed by both conditions).
        lowest_in_group = data[list(tissues)].min(axis=1)
        highest_outside = data[other_tissues].max(axis=1)
        is_enriched = (lowest_in_group >= 1) & (lowest_in_group >= 5 * highest_outside)
        # Take an explicit copy before adding the label column: assigning into
        # a boolean-filtered slice of `data` triggers pandas'
        # chained-assignment warning.
        enriched = data[is_enriched].copy()
        enriched['group'] = ', '.join(tissues)
        group_enriched.append(enriched)
# The list is empty when `data` has fewer than two columns; fall back to an
# empty frame in that case.
group_enriched = pd.concat(group_enriched) if group_enriched else pd.DataFrame()
# 4. Tissue Enhanced: expression >= 1 in one tissue, and at least 5 times the
#    mean expression of the other tissues. Genes already present in
#    `tissue_enriched` or `group_enriched` are excluded.
tissue_enhanced = []
for tissue in data.columns:
    # Per-gene mean over every column except the current one.
    mean_elsewhere = data.drop(columns=[tissue]).mean(axis=1)
    enhanced = data[(data[tissue] >= 1) & (data[tissue] >= 5 * mean_elsewhere)]
    already_enriched = (
        enhanced.index.isin(tissue_enriched.index)
        | enhanced.index.isin(group_enriched.index)
    )
    # Take an explicit copy before adding the label column: assigning into a
    # filtered slice of `data` triggers pandas' chained-assignment warning.
    enhanced = enhanced[~already_enriched].copy()
    enhanced['tissue'] = tissue
    tissue_enhanced.append(enhanced)
# The list is empty only when `data` has no columns; fall back to an empty frame.
tissue_enhanced = pd.concat(tissue_enhanced) if tissue_enhanced else pd.DataFrame()
# 5. Expressed in all: expression >= 1 in every tissue, and the gene is not in
#    any of the categories computed above.
in_earlier_category = (
    data.index.isin(not_expressed.index)
    | data.index.isin(tissue_enriched.index)
    | data.index.isin(group_enriched.index)
    | data.index.isin(tissue_enhanced.index)
)
expressed_everywhere = (data >= 1).all(axis=1)
expressed_in_all = data[expressed_everywhere & ~in_earlier_category]
# 6. Mixed: every gene that did not land in any of the categories above.
classified_index = pd.concat(
    [not_expressed, tissue_enriched, group_enriched, tissue_enhanced, expressed_in_all]
).index
mixed = data[~data.index.isin(classified_index)]
# Write each category to its own CSV file in the current working directory.
category_outputs = (
    (not_expressed, "NotExpressed_genes.csv"),
    (tissue_enriched, "TissueEnriched_genes.csv"),
    (group_enriched, "GroupEnriched_genes.csv"),
    (tissue_enhanced, "TissueEnhanced_genes.csv"),
    (expressed_in_all, "ExpressedInAll_genes.csv"),
    (mixed, "Mixed_genes.csv"),
)
for category_frame, csv_name in category_outputs:
    category_frame.to_csv(csv_name)
print("分类完成,结果已保存到文件。")
|
|