{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "50887e04-45df-4d6d-aef9-801633ad0a18", "metadata": {}, "outputs": [], "source": [ "from optparse import OptionParser\n", "import sys\n", "import pickle\n", "from itertools import islice, chain\n", "import gzip\n", "from collections import defaultdict\n", "import numpy as np\n", "from collections import Counter\n", "import matplotlib.pyplot as plt\n", "\n", "import regex as re\n", "\n", "import Levenshtein" ] }, { "cell_type": "code", "execution_count": null, "id": "0d8db3ef-5be9-4376-b90c-9915ad5c35da", "metadata": {}, "outputs": [], "source": [ "#inputs for Guide RNA quantification\n", "bcs = np.loadtxt(\"cb.txt\", dtype=str) #load cell barcodes extracted from seurat object built on RNA\n", "guides = np.loadtxt(\"guides.txt\", dtype=str) #load guides \n", "guide_seq = guides[:,1] #extract sequences of each guides\n", "guide_seq = np.char.upper(guide_seq) #convert sequences to upper case\n", "guides = guides[:,0] #extract guide name\n", "print(guide_seq)\n", "print(guides)\n", "guide_cr = \"GCTATGCTGTTTCCAGCTTAGCTCTTAAAC\" #constant region to look for\n", "umi_n_muts = 1 #how many mutations to allow when classifying a read as a unique molecule\n", "\n", "r1_file = \"R1.fastq.gz\" #put path to your R1 file here\n", "r2_file = \"R2.fastq.gz\" #put path to your R2 file here\n", "\n", "guide_length = len(guide_seq[1])\n", "print(guide_length)" ] }, { "cell_type": "code", "execution_count": null, "id": "b705b47e-02af-4ca7-83e2-194e2ca261b1", "metadata": {}, "outputs": [], "source": [ "# hamming distance function\n", "# we use this to assign CB/UMI, as well as guide identity\n", "def _hamming(s1, s2) :\n", " d = 0.\n", " for j in range(len(s1)) :\n", " if s1[j] != s2[j] :\n", " d += 1.\n", " return d" ] }, { "cell_type": "code", "execution_count": null, "id": "b1175fb0-88a1-4d6b-a69a-956bf1cf2857", "metadata": {}, "outputs": [], "source": [ "#Build dictionary of cell barcodes (single-mutation support)\n", "#the keys are equal to all cell barcodes (and their 1bp mutants, assuming this doesn't collide with \n", "#another real barcode\n", "bases = ['A', 'C', 'G', 'T'] \n", "barcode_dict = {}\n", "sequences = []\n", "for i in range(len(bcs)):\n", " bc = bcs[i]\n", " barcode_dict[bc] = i\n", " for pos1 in range(len(bc)) :\n", " for b1 in bases:\n", " bc_mut = bc[:pos1] + b1 + bc[pos1+1:]\n", " if bc_mut in barcode_dict and barcode_dict[bc_mut] != i :\n", " del barcode_dict[bc_mut] \n", " else:\n", " barcode_dict[bc_mut] = i" ] }, { "cell_type": "code", "execution_count": null, "id": "93a8d434-6193-4cad-a6b7-b8b48784f522", "metadata": {}, "outputs": [], "source": [ "#build ALT2 dictionary of guides\n", "#now only the original guide sequences are used as keys\n", "gdict = {}\n", "sequences = []\n", "#for i in range(len(guide_seq)):\n", "for i, g in enumerate(guide_seq):\n", " # Directly map the original guide sequence to its index\n", " g = guide_seq[i]\n", " gdict[g] = i" ] }, { "cell_type": "code", "execution_count": null, "id": "10e1e511-3fd6-469f-b267-a1d0f5007e63", "metadata": {}, "outputs": [], "source": [ "#initialize CB x guides matrix\n", "guide_m = np.zeros((len(bcs), len(guide_seq))) \n", "\n", "#make dict to keep track of what umis have been seen for each CB\n", "umi_dict = {}\n", "for bc in bcs:\n", " umi_dict[bc] = {}\n", " \n", "r1 = gzip.open(r1_file)\n", "r2 = gzip.open(r2_file)\n", "\n", "#initialize umi counting\n", "total_line = 0\n", "skips = 0 \n", "not_skips = 0\n", "guide_reads = 0\n", "\n", "#while total_line < 100000: \n", "# recommend this for debugging if you want to just run a subset of lines\n", "while True : #comment this out if you dont' want to process the entire file\n", " read_name = r2.readline()\n", " if len(read_name) == 0: \n", " break\n", " seq = r2.readline().strip()\n", " seq = seq.decode('utf8')\n", " strand = r2.readline().strip()\n", " quality = r2.readline().strip()\n", "\n", " read1_name = r1.readline().strip()\n", " read1_seq = r1.readline().strip()\n", " read1_seq = read1_seq.decode('utf8')\n", " read1_strand = r1.readline().strip()\n", " read1_quality = r1.readline().strip()\n", "\n", " total_line += 1\n", " if total_line % 1000000 == 0 :\n", " print(\"Processing read \" + str(total_line) + \"...\")\n", "\n", " #check the hamming distance between the constant region and the beginning of the read\n", " hamming_dist = _hamming(seq[0:len(guide_cr)], guide_cr)\n", " if hamming_dist >3:\n", " continue\n", " else: \n", " guide_reads +=1\n", " gseq = seq[len(guide_cr):len(guide_cr) + guide_length]\n", " #print(guide_seq)\n", " #print(len(guide_seq))\n", "\n", " #now check which guide matches sequence\n", " if gseq in gdict:\n", " g = guides[gdict[gseq]] #get guide identity allowing 1bp mismatch\n", " #correct cell barcode\n", " cb = read1_seq[0:16] \n", " if cb in barcode_dict:\n", " cb_correct = bcs[barcode_dict[cb]]\n", " #now check UMI\n", " umi = read1_seq[16:16+12] \n", " #add the guide to the umi dictionary\n", " if g not in umi_dict[cb_correct] :\n", " umi_dict[cb_correct][g] = {}\n", " \n", " #increment the count if that UMI has been seen for that barcode\n", " umi_visited = False\n", " if umi in umi_dict[cb_correct][g]: \n", " umi_visited = True\n", " umi_dict[cb_correct][g][umi] += 1\n", " #now check mutations \n", " if umi_n_muts == 1 and not umi_visited:\n", " for pos1 in range(len(umi)):\n", " for b1 in bases :\n", " umi_mut = umi[:pos1] + b1 + umi[pos1+1:]\n", " if umi_mut in umi_dict[cb_correct][g] :\n", " umi_visited = True\n", " umi_dict[cb_correct][g][umi_mut] += 1\n", " break #don't check every other UMI mutant\n", " if umi_visited : \n", " break #don't check every other UMI mutant\n", "\n", " #if we've encounted this UMI before for this guide/cell, \n", " #then we skip this read\n", " if umi_visited: #count number of times we skip a read \n", " skips += 1\n", " continue \n", " #if it's a new UMI, increment count matrix appropriately\n", " else: \n", " not_skips += 1\n", " umi_dict[cb_correct][g][umi] = 1\n", " guide_m[barcode_dict[cb], gdict[gseq]] += 1.\n", " else:\n", " continue\n" ] }, { "cell_type": "code", "execution_count": null, "id": "faca1150-b1aa-46a7-98fd-b5223b54ac16", "metadata": {}, "outputs": [], "source": [ "print(guide_m.sum())\n", "#print(guide_reads)" ] }, { "cell_type": "code", "execution_count": null, "id": "6382afec-eb70-427b-8177-8b165e9b12e5", "metadata": {}, "outputs": [], "source": [ "#save guide matrix\n", "np.savetxt('guide_counts' + '.txt', guide_m, fmt='%s')" ] }, { "cell_type": "code", "execution_count": null, "id": "c55d8194-95f9-457e-8d35-ac3a8ea23415", "metadata": {}, "outputs": [], "source": [ "#inputs for ADT quantification - since this uses the same script, i kept the variable names the same even though it is ADTs not guides\n", "bcs = np.loadtxt(\"cb.txt\", dtype=str) #load cell barcodes \n", "guides = np.loadtxt(\"adt.txt\", dtype=str) #load guides \n", "guide_seq = guides[:,1] #extract sequences of each guides\n", "guide_seq = np.char.upper(guide_seq) #convert sequences to upper case\n", "guides = guides[:,0] #extract guide name\n", "print(guide_seq)\n", "print(guides)\n", "guide_cr = \"GCTTTAAGGCCGGTCC\" #constant region to look for\n", "umi_n_muts = 1 #how many mutations to allow when classifying a read as a unique molecule\n", "\n", "\n", "r1_file = \"R1.fastq.gz\" #put path to your R1 file here\n", "r2_file = \"R2.fastq.gz\" #put path to your R2 file here\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "18da52b4-addf-4388-814c-c9eee7e01b1a", "metadata": {}, "outputs": [], "source": [ "#Build dictionary of cell barcodes (single-mutation support)\n", "#the keys are equal to all cell barcodes (and their 1bp mutants, assuming this doesn't collide with \n", "#another real barcode\n", "bases = ['A', 'C', 'G', 'T'] \n", "barcode_dict = {}\n", "sequences = []\n", "for i in range(len(bcs)):\n", " bc = bcs[i]\n", " barcode_dict[bc] = i\n", " for pos1 in range(len(bc)) :\n", " for b1 in bases:\n", " bc_mut = bc[:pos1] + b1 + bc[pos1+1:]\n", " if bc_mut in barcode_dict and barcode_dict[bc_mut] != i :\n", " del barcode_dict[bc_mut] \n", " else:\n", " barcode_dict[bc_mut] = i" ] }, { "cell_type": "code", "execution_count": null, "id": "2c9a8a60-bbe4-484b-a279-264ce6aacf56", "metadata": {}, "outputs": [], "source": [ "#build dictionary of guides\n", "#similar structure where the keys are all sequences within 1bp edits of guide, \n", "#and values correspond to index of original guide sequence\n", "gdict = {}\n", "sequences = []\n", "#for i in range(1):\n", "for i in range(len(guide_seq)):\n", " g = guide_seq[i]\n", " gdict[g] = i\n", " for pos1 in range(len(g)):\n", " for b1 in bases:\n", " g_mut = g[:pos1] + b1 + g[pos1+1:]\n", " if g_mut in gdict and gdict[g_mut] != i:\n", " print(\"[ERROR] Barcode dictionary collision.\")\n", " continue\n", " else: \n", " #print(g_mut)\n", " gdict[g_mut] = i\n", " #print(g_mut)\n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "7bc155fb-e8f3-4f3f-b78e-39eb4871cc6d", "metadata": {}, "outputs": [], "source": [ "#initialize CB x guides matrix\n", "guide_m = np.zeros((len(bcs), len(guide_seq))) \n", "\n", "#make dict to keep track of what umis have been seen for each CB\n", "umi_dict = {}\n", "for bc in bcs:\n", " umi_dict[bc] = {}\n", " \n", "r1 = gzip.open(r1_file)\n", "r2 = gzip.open(r2_file)\n", "\n", "#initialize umi counting\n", "total_line = 0\n", "skips = 0 \n", "not_skips = 0\n", "guide_reads = 0\n", "\n", "#while total_line < 100000: \n", "# recommend this for debugging if you want to just run a subset of lines\n", "while True : #comment this out if you dont' want to process the entire file\n", " read_name = r2.readline()\n", " if len(read_name) == 0: \n", " break\n", " seq = r2.readline().strip()\n", " seq = seq.decode('utf8')\n", " strand = r2.readline().strip()\n", " quality = r2.readline().strip()\n", "\n", " read1_name = r1.readline().strip()\n", " read1_seq = r1.readline().strip()\n", " read1_seq = read1_seq.decode('utf8')\n", " read1_strand = r1.readline().strip()\n", " read1_quality = r1.readline().strip()\n", "\n", " total_line += 1\n", " if total_line % 1000000 == 0 :\n", " print(\"Processing read \" + str(total_line) + \"...\")\n", "\n", " #check the hamming distance between the constant region and the beginning of the read\n", " hamming_dist = _hamming(seq[34:50], guide_cr)\n", " if hamming_dist >3:\n", " continue\n", " else: \n", " guide_reads +=1\n", " gseq = seq[10:25]\n", " #print(guide_seq)\n", " #print(len(guide_seq))\n", "\n", " #now check which guide matches sequence\n", " if gseq in gdict:\n", " g = guides[gdict[gseq]] #get guide identity allowing 1bp mismatch\n", " #correct cell barcode\n", " cb = read1_seq[0:16] \n", " if cb in barcode_dict:\n", " cb_correct = bcs[barcode_dict[cb]]\n", " #now check UMI\n", " umi = read1_seq[16:16+12] #lol i hard-coded this\n", " #add the guide to the umi dictionary\n", " if g not in umi_dict[cb_correct] :\n", " umi_dict[cb_correct][g] = {}\n", " \n", " #increment the count if that UMI has been seen for that barcode\n", " umi_visited = False\n", " if umi in umi_dict[cb_correct][g]: \n", " umi_visited = True\n", " umi_dict[cb_correct][g][umi] += 1\n", " #now check mutations \n", " if umi_n_muts == 1 and not umi_visited:\n", " for pos1 in range(len(umi)):\n", " for b1 in bases :\n", " umi_mut = umi[:pos1] + b1 + umi[pos1+1:]\n", " if umi_mut in umi_dict[cb_correct][g] :\n", " umi_visited = True\n", " umi_dict[cb_correct][g][umi_mut] += 1\n", " break #don't check every other UMI mutant\n", " if umi_visited : \n", " break #don't check every other UMI mutant\n", "\n", " #if we've encounted this UMI before for this guide/cell, \n", " #then we skip this read\n", " if umi_visited: #count number of times we skip a read \n", " skips += 1\n", " continue \n", " #if it's a new UMI, increment count matrix appropriately\n", " else: \n", " not_skips += 1\n", " umi_dict[cb_correct][g][umi] = 1\n", " guide_m[barcode_dict[cb], gdict[gseq]] += 1.\n", " else:\n", " continue" ] }, { "cell_type": "code", "execution_count": null, "id": "737ad7dc-de4f-4254-ab02-a120480351df", "metadata": {}, "outputs": [], "source": [ "print(guide_m.sum())\n", "#print(guide_reads)" ] }, { "cell_type": "code", "execution_count": null, "id": "cd01fb6e-e519-43d3-bafa-cd0a7b95046e", "metadata": {}, "outputs": [], "source": [ "#save ADT matrix\n", "np.savetxt('adt_counts' + '.txt', guide_m, fmt='%s')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e11ed428-7b62-40b6-912e-2219e5b51e6c", "metadata": {}, "outputs": [], "source": [ "#inputs for OligoHash Quantification - again, the variables are the same names, but we have to reenter them so they point to the write place\n", "bcs = np.loadtxt(\"cb.txt\", dtype=str) #load cell barcodes \n", "guides = np.loadtxt(\"oligohash.txt\", dtype=str) #load guides \n", "guide_seq = guides[:,1] #extract sequences of each guides\n", "guide_seq = np.char.upper(guide_seq) #convert sequences to upper case\n", "guides = guides[:,0] #extract guide name\n", "print(guide_seq)\n", "print(guides)\n", "guide_cr = \"GCTTTAAGGCCGGTCCTAGCAA\" #constant region to look for\n", "umi_n_muts = 1 #how many mutations to allow when classifying a read as a unique molecule\n", "\n", "r1_file = \"R1.fastq.gz\" #put path to your R1 file here\n", "r2_file = \"R2.fastq.gz\" #put path to your R2 file here\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b41d4c6d-1363-46b1-8190-27d2417525c2", "metadata": {}, "outputs": [], "source": [ "#Build dictionary of cell barcodes (single-mutation support)\n", "#the keys are equal to all cell barcodes (and their 1bp mutants, assuming this doesn't collide with \n", "#another real barcode\n", "bases = ['A', 'C', 'G', 'T'] \n", "barcode_dict = {}\n", "sequences = []\n", "for i in range(len(bcs)):\n", " bc = bcs[i]\n", " barcode_dict[bc] = i\n", " for pos1 in range(len(bc)) :\n", " for b1 in bases:\n", " bc_mut = bc[:pos1] + b1 + bc[pos1+1:]\n", " if bc_mut in barcode_dict and barcode_dict[bc_mut] != i :\n", " del barcode_dict[bc_mut] \n", " else:\n", " barcode_dict[bc_mut] = i" ] }, { "cell_type": "code", "execution_count": null, "id": "45b86823-7040-4f6f-9247-fdb38a0ee806", "metadata": {}, "outputs": [], "source": [ "#build dictionary of guides\n", "#similar structure where the keys are all sequences within 1bp edits of guide, \n", "#and values correspond to index of original guide sequence\n", "gdict = {}\n", "sequences = []\n", "#for i in range(1):\n", "for i in range(len(guide_seq)):\n", " g = guide_seq[i]\n", " gdict[g] = i\n", " for pos1 in range(len(g)):\n", " for b1 in bases:\n", " g_mut = g[:pos1] + b1 + g[pos1+1:]\n", " if g_mut in gdict and gdict[g_mut] != i:\n", " print(\"[ERROR] Barcode dictionary collision.\")\n", " continue\n", " else: \n", " #print(g_mut)\n", " gdict[g_mut] = i\n", " #print(g_mut)\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "46ed814a-e319-4339-9716-7803f49291f1", "metadata": {}, "outputs": [], "source": [ "#initialize CB x guides matrix\n", "guide_m = np.zeros((len(bcs), len(guide_seq))) \n", "\n", "#make dict to keep track of what umis have been seen for each CB\n", "umi_dict = {}\n", "for bc in bcs:\n", " umi_dict[bc] = {}\n", " \n", "r1 = gzip.open(r1_file)\n", "r2 = gzip.open(r2_file)\n", "\n", "#initialize umi counting\n", "total_line = 0\n", "skips = 0 \n", "not_skips = 0\n", "guide_reads = 0\n", "\n", "# while total_line < 100000: \n", "# recommend this for debugging if you want to just run a subset of lines\n", "while True : #comment this out if you dont' want to process the entire file\n", " read_name = r2.readline()\n", " if len(read_name) == 0: \n", " break\n", " seq = r2.readline().strip()\n", " seq = seq.decode('utf8')\n", " strand = r2.readline().strip()\n", " quality = r2.readline().strip()\n", "\n", " read1_name = r1.readline().strip()\n", " read1_seq = r1.readline().strip()\n", " read1_seq = read1_seq.decode('utf8')\n", " read1_strand = r1.readline().strip()\n", " read1_quality = r1.readline().strip()\n", "\n", " total_line += 1\n", " if total_line % 1000000 == 0 :\n", " print(\"Processing read \" + str(total_line) + \"...\")\n", "\n", " #check the hamming distance between the constant region and the beginning of the read\n", " hamming_dist = _hamming(seq[15:37],guide_cr)\n", " if hamming_dist >3:\n", " continue\n", " else: \n", " guide_reads +=1\n", " gseq = seq[0:15]\n", " #print(guide_seq)\n", " #print(len(guide_seq))\n", "\n", " #now check which guide matches sequence\n", " if gseq in gdict:\n", " g = guides[gdict[gseq]] #get guide identity allowing 1bp mismatch\n", " #correct cell barcode\n", " cb = read1_seq[0:16] \n", " if cb in barcode_dict:\n", " cb_correct = bcs[barcode_dict[cb]]\n", " #now check UMI\n", " umi = read1_seq[16:16+12] \n", " #add the guide to the umi dictionary\n", " if g not in umi_dict[cb_correct] :\n", " umi_dict[cb_correct][g] = {}\n", " \n", " #increment the count if that UMI has been seen for that barcode\n", " umi_visited = False\n", " if umi in umi_dict[cb_correct][g]: \n", " umi_visited = True\n", " umi_dict[cb_correct][g][umi] += 1\n", " #now check mutations \n", " if umi_n_muts == 1 and not umi_visited:\n", " for pos1 in range(len(umi)):\n", " for b1 in bases :\n", " umi_mut = umi[:pos1] + b1 + umi[pos1+1:]\n", " if umi_mut in umi_dict[cb_correct][g] :\n", " umi_visited = True\n", " umi_dict[cb_correct][g][umi_mut] += 1\n", " break #don't check every other UMI mutant\n", " if umi_visited : \n", " break #don't check every other UMI mutant\n", "\n", " #if we've encounted this UMI before for this guide/cell, \n", " #then we skip this read\n", " if umi_visited: #count number of times we skip a read \n", " skips += 1\n", " continue \n", " #if it's a new UMI, increment count matrix appropriately\n", " else: \n", " not_skips += 1\n", " umi_dict[cb_correct][g][umi] = 1\n", " guide_m[barcode_dict[cb], gdict[gseq]] += 1.\n", " else:\n", " continue\n" ] }, { "cell_type": "code", "execution_count": null, "id": "188aa755-b6a8-455f-983a-bf98b9e460c7", "metadata": {}, "outputs": [], "source": [ "print(guide_m.sum())\n", "#print(guide_reads)" ] }, { "cell_type": "code", "execution_count": null, "id": "dc748d85-b009-47b6-9a64-32176f9fa642", "metadata": {}, "outputs": [], "source": [ "#save OligoHash matrix\n", "np.savetxt('OligoHash_counts'+'.txt', guide_m, fmt='%s')" ] }, { "cell_type": "code", "execution_count": null, "id": "1cacf3e2-f3c2-4c31-8e95-46c07a2374a5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }