Append successive lists two by two

Hi,

I am a biologist and decided to learn Python to automate my day-to-day tasks and manage large datasets.

I am currently working on a small script to extract data and format them for analysis.

Here is my initial file containing my data (this is a small example of how the data are formatted):

CruSTS5_GC_30000	AUGUSTUS	gene	3665	6720	0.29	-	.	g1
CruSTS5_GC_30000	AUGUSTUS	transcript	3665	6720	0.29	-	.	g1.t1
CruSTS5_GC_30000	AUGUSTUS	stop_codon	3665	3667	.	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	terminal	3665	3755	0.98	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	internal	3810	5746	0.84	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	internal	5801	5936	0.96	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	internal	5991	6137	0.51	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	internal	6196	6295	0.86	-	2	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	internal	6349	6443	0.83	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	internal	6503	6615	0.9	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	initial	6670	6720	0.98	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	3756	3809	0.99	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	5747	5800	1	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	5937	5990	0.94	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	6138	6195	0.8	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	6296	6348	1	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	6444	6502	0.84	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	intron	6616	6669	0.99	-	.	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	3668	3755	0.98	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	3810	5746	0.84	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	5801	5936	0.96	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	5991	6137	0.51	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	6196	6295	0.86	-	2	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	6349	6443	0.83	-	1	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	6503	6615	0.9	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	CDS	6670	6720	0.98	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	start_codon	6718	6720	.	-	0	transcript_id "g1.t1"; gene_id "g1";
CruSTS5_GC_30000	AUGUSTUS	gene	7534	8101	0.6	+	.	g2
CruSTS5_GC_30000	AUGUSTUS	transcript	7534	8101	0.6	+	.	g2.t1
CruSTS5_GC_30000	AUGUSTUS	start_codon	7534	7536	.	+	0	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	initial	7534	7596	0.84	+	0	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	internal	7647	7770	0.72	+	0	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	terminal	7827	8101	0.63	+	2	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	intron	7597	7646	0.84	+	.	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	intron	7771	7826	0.72	+	.	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	CDS	7534	7596	0.84	+	0	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	CDS	7647	7770	0.72	+	0	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	CDS	7827	8098	0.63	+	2	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	stop_codon	8099	8101	.	+	0	transcript_id "g2.t1"; gene_id "g2";
CruSTS5_GC_30000	AUGUSTUS	gene	8621	12107	0.15	-	.	g3
CruSTS5_GC_30000	AUGUSTUS	transcript	8621	12107	0.12	-	.	g3.t1
CruSTS5_GC_30000	AUGUSTUS	stop_codon	8621	8623	.	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	terminal	8621	8878	1	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	8936	8996	1	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9056	9219	1	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9277	9292	0.6	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9352	9357	0.6	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9413	9634	0.91	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9684	9739	0.97	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9791	9871	0.86	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9929	10056	0.99	-	2	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	10114	10355	0.76	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	10928	11527	0.58	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	11595	11770	0.98	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	11824	11979	0.98	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	initial	12033	12107	0.59	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	8879	8935	1	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	8997	9055	1	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9220	9276	1	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9293	9351	0.48	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9358	9412	0.6	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9635	9683	0.97	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9740	9790	0.86	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9872	9928	0.95	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	10057	10113	0.99	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	10356	10927	0.54	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	11528	11594	0.99	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	11771	11823	1	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	11980	12032	0.99	-	.	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	8624	8878	1	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	8936	8996	1	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9056	9219	1	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9277	9292	0.6	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9352	9357	0.6	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9413	9634	0.91	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9684	9739	0.97	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9791	9871	0.86	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9929	10056	0.99	-	2	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	10114	10355	0.76	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	10928	11527	0.58	-	1	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	11595	11770	0.98	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	11824	11979	0.98	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	12033	12107	0.59	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	start_codon	12105	12107	.	-	0	transcript_id "g3.t1"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	transcript	8621	12107	0.03	-	.	g3.t2
CruSTS5_GC_30000	AUGUSTUS	stop_codon	8621	8623	.	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	terminal	8621	8878	1	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	8936	8996	1	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9056	9219	1	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9277	9310	0.32	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9413	9634	0.91	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9684	9739	0.97	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9791	9871	0.86	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	9929	10056	0.99	-	2	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	10114	10355	0.76	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	10928	11527	0.58	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	11595	11770	0.98	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	internal	11824	11979	0.98	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	initial	12033	12107	0.59	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	8879	8935	1	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	8997	9055	1	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9220	9276	1	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9311	9412	0.2	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9635	9683	0.97	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9740	9790	0.86	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	9872	9928	0.95	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	10057	10113	0.99	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	10356	10927	0.54	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	11528	11594	0.99	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	11771	11823	1	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	intron	11980	12032	0.99	-	.	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	8624	8878	1	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	8936	8996	1	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9056	9219	1	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9277	9310	0.32	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9413	9634	0.91	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9684	9739	0.97	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9791	9871	0.86	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	9929	10056	0.99	-	2	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	10114	10355	0.76	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	10928	11527	0.58	-	1	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	11595	11770	0.98	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	11824	11979	0.98	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	CDS	12033	12107	0.59	-	0	transcript_id "g3.t2"; gene_id "g3";
CruSTS5_GC_30000	AUGUSTUS	start_codon	12105	12107	.	-	0	transcript_id "g3.t2"; gene_id "g3";
...

What I want to do is extract only the start and stop positions, discard the non-essential data, and obtain something like this:

['stop_codon', '3665', '3667', 'g1.t1']
['start_codon', '6718', '6720', 'g1.t1']
['start_codon', '7534', '7536', 'g2.t1']
['stop_codon', '8099', '8101', 'g2.t1']
['stop_codon', '8621', '8623', 'g3.t1']
['start_codon', '12105', '12107', 'g3.t1']
['stop_codon', '8621', '8623', 'g3.t2']
['start_codon', '12105', '12107', 'g3.t2']
...

I could successfully complete this using the following script:

import re
from operator import itemgetter

DataFile = "/mnt/SSD2/Python/Gene Cluster/CruSTS5Cop.gtf"

#Definition

def Status(Lines):
    """Record a codon record in the matching module-level variable and print it.

    If "start_codon" is found in ``Lines`` the record is stored in the global
    ``StartLines``; every other record is stored in the global ``StopLines``.
    The stored record is printed in both cases. Nothing is returned.
    """
    global StartLines, StopLines
    IsStart = "start_codon" in Lines
    if not IsStart:
        # Anything that is not a start codon is treated as a stop codon.
        StopLines = Lines
        print(StopLines)
        return
    StartLines = Lines
    print(StartLines)

#Begining of the script
# Stream the GTF file and hand every start/stop codon record to Status().
IndexToKeep = (2, 3, 4, 9)  # Columns kept: feature, start, end, transcript id
with open(DataFile, "r") as GCData:
    # Iterate over the file object directly so a large file is never loaded
    # into memory as a whole.
    for Lines in GCData:
        Lines = Lines.strip()  # Remove the line break at the end
        Lines = re.sub(r"\s+", "\t", Lines)  # Replace each run of whitespace by one tabulation
        Lines = Lines.replace(";", "").replace('"', "")  # Remove every ; and "

        if re.search("start_codon|stop_codon", Lines):  # "|" acts as "or"
            Fields = Lines.split("\t")  # Convert string to list
            # Select by position, not by value lookup: list.index() only finds
            # the first occurrence, so a repeated value would be mis-selected.
            # Positions beyond the end of a short line are simply skipped.
            SingleLines = [Fields[Index] for Index in IndexToKeep if Index < len(Fields)]

            Status(SingleLines)

Now, the next step for which I am asking your help, is to append the successive lists two by two to get something like this, match a start/stop with its successive stop/start list.

['stop_codon', '3665', '3667', 'g1.t1', 'start_codon', '6718', '6720', 'g1.t1']
['start_codon', '7534', '7536', 'g2.t1', 'stop_codon', '8099', '8101', 'g2.t1']
['stop_codon', '8621', '8623', 'g3.t1', 'start_codon', '12105', '12107', 'g3.t1']
['stop_codon', '8621', '8623', 'g3.t2', 'start_codon', '12105', '12107', 'g3.t2']

Any idea how I could obtain something like that?

My idea was to use a counter and append the second list to the first, then reset the counter and continue in a loop, but it is a little difficult for me to code this potential solution.

If you have any other solutions, they are welcome.

Thank you in advance for your help.

Hi,
great, genetics! Here is a solution with a generator (implemented in the short form - as a generator expression).

Input data:
# Sample rows in the form [feature, start, end, transcript_id]; each
# consecutive pair of rows holds the two codon boundaries of one transcript.
codon_boundaries = [
    ['stop_codon', '3665', '3667', 'g1.t1'],
    ['start_codon', '6718', '6720', 'g1.t1'],
    ['start_codon', '7534', '7536', 'g2.t1'],
    ['stop_codon', '8099', '8101', 'g2.t1'],
    ['stop_codon', '8621', '8623', 'g3.t1'],
    ['start_codon', '12105', '12107', 'g3.t1'],
    ['stop_codon', '8621', '8623', 'g3.t2'],
    ['start_codon', '12105', '12107', 'g3.t2'],
]

Processing:

# A single shared iterator is passed to zip() twice, so every step of zip()
# draws two consecutive boundaries from it.
codon_boundaries_iter = iter(codon_boundaries)
# zip() is used instead of calling next() inside the generator expression:
# with an odd number of boundaries, next() would raise StopIteration inside
# the generator, which Python 3.7+ converts into a RuntimeError (PEP 479).
# zip() simply stops, leaving a trailing unpaired boundary out.
codon_pairs = (
    boundary1 + boundary2                       # concatenate the boundary with the next one
    for boundary1, boundary2                    # iterating over boundaries two at a time
    in zip(codon_boundaries_iter, codon_boundaries_iter))

print(*codon_pairs, sep='\n')                   # print items separated by newlines
Text output:
['stop_codon', '3665', '3667', 'g1.t1', 'start_codon', '6718', '6720', 'g1.t1']
['start_codon', '7534', '7536', 'g2.t1', 'stop_codon', '8099', '8101', 'g2.t1']
['stop_codon', '8621', '8623', 'g3.t1', 'start_codon', '12105', '12107', 'g3.t1']
['stop_codon', '8621', '8623', 'g3.t2', 'start_codon', '12105', '12107', 'g3.t2']

The advantage of generators is that you can process huge amounts of data without needing much memory. But be careful: a generator can be iterated only once. After the print, the codon_pairs generator will be exhausted (empty).

If you want to materialize the items from the generator into a list in memory (giving up the generator’s memory advantage), just do codon_pairs_list = list(codon_pairs), or replace the round brackets around the generator expression with square brackets.

Note: The print expression with *codon_pairs spoils the memory efficiency. A loop is needed to keep the advantage.

Note2: I have only a very basic idea of the genetic code. I just guessed that the start and stop codons or their places can be called codon_boundaries. Of course change the variable names to the most descriptive ones.

Hello and welcome, Francois!

Thank you for the exercise. This code…

  1. makes separate lists for the first and second lines of all couplets, then…
  2. takes the first element from each line of the couplet, and…
  3. checks the first element of the first member of the couplet
    …to see if it is a “Start” leading or “Stop” leading pair.

After that, we concatenate the lines (in Start>Stop order) into a single list element and add the merged couplet to the output list.

# Codon records in file order; every two consecutive rows form one couplet.
CodonList = [
    ['stop_codon', '3665', '3667', 'g1.t1'],
    ['start_codon', '6718', '6720', 'g1.t1'],
    ['start_codon', '7534', '7536', 'g2.t1'],
    ['stop_codon', '8099', '8101', 'g2.t1'],
]

# Merge each couplet into one list, always placing the start record first.
SingleLines = []
Rows = iter(CodonList)
# Passing the same iterator twice makes zip() yield rows (0, 1), (2, 3), ...
for First, Second in zip(Rows, Rows):
    StartLeads = str(First[0]).startswith('start')
    Merged = First + Second if StartLeads else Second + First
    SingleLines.append(Merged)

for Merged in SingleLines:
    print(Merged)

This is the output it gives (NOTE: the last two couplets are duplicates; I changed the very last one to be unique):

['start_codon', '6718', '6720', 'g1.t1', 'stop_codon', '3665', '3667', 'g1.t1']
['start_codon', '7534', '7536', 'g2.t1', 'stop_codon', '8099', '8101', 'g2.t1']  
['start_codon', '12105', '12107', 'g3.t1', 'stop_codon', '8621', '8623', 'g3.t1']
['start_codon', '11111', '22222', 'g3.t2', 'stop_codon', '9999', '8888', 'g3.t2']
And here's an expanded version (CLICK to unfold)
# Codon records in file order; every two consecutive rows form one couplet.
CodonList = [
    ['stop_codon', '3665', '3667', 'g1.t1'],
    ['start_codon', '6718', '6720', 'g1.t1'],
    ['start_codon', '7534', '7536', 'g2.t1'],
    ['stop_codon', '8099', '8101', 'g2.t1'],
    ['stop_codon', '8621', '8623', 'g3.t1'],
    ['start_codon', '12105', '12107', 'g3.t1'],
    ['stop_codon', '9999', '8888', 'g3.t2'],
    ['start_codon', '11111', '22222', 'g3.t2'],
]

ShortLine = []
# First and second member of every couplet, as two parallel lists.
EvenLines, OddLines = CodonList[0::2], CodonList[1::2]
Separator = '-------------------'

print("EvenLines:\n", EvenLines)
print("OddLines:\n", OddLines)
print(Separator)

# Trace each couplet and merge it with the start record first.
for Line1, Line2 in zip(EvenLines, OddLines):
    print("Line1:", Line1)
    print("Line2:", Line2)
    if not str(Line1[0]).startswith('start'):
        # Stop-leading couplet: swap so the start record comes first.
        print("Line2FIRST:\n" + " " * 10, Line2, Line1)
        Merged = Line2 + Line1
    else:
        print("Line1FIRST:\n          ", Line1, Line2)
        Merged = Line1 + Line2
    ShortLine.append(Merged)
    print(Separator)

print("ShortLine")
for item in ShortLine:
    print(item)