Skip to content

Commit c2d9db0

Browse files
committed
Remove duplicates from HCMI
This resolves #397 because genes are no longer counted multiple times.
1 parent 7727fbd commit c2d9db0

2 files changed

Lines changed: 6 additions & 3 deletions

File tree

build/hcmi/02-getHCMIData.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -613,10 +613,13 @@ def write_dataframe_to_csv(dataframe, outname):
613613
-------
614614
None
615615
"""
616+
dataframe = dataframe.to_pandas()
617+
dataframe = dataframe.drop_duplicates()
618+
616619
if('gz' in outname):
617-
dataframe.to_pandas().to_csv(outname,compression='gzip',index=False)
620+
dataframe.to_csv(outname,compression='gzip',index=False)
618621
else:
619-
dataframe.to_pandas().to_csv(outname,index=False)
622+
dataframe.to_csv(outname,index=False)
620623
return
621624

622625
def main():

build/utils/tpmFromCounts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def main(counts_data, genome_link, gene_column, exclude_columns, out_file):
7878
if __name__=='__main__':
7979
parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix")
8080

81-
parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix')
81+
parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix where rows represent gene values and columns represent samples')
8282
parser.add_argument('--genome_build', default="https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz", help='Link to human genome build')
8383
parser.add_argument('--gene_col', default="stable_id", help='Name of column with gene names')
8484
parser.add_argument('--exclude_col', default="stable_id,display_label,description,biotype", help='Name of column with gene names')

0 commit comments

Comments
 (0)