| Home | Trees | Indices | Help |
|
|---|
|
|
1 __VERSION__="ete2-2.0rev104"
2 # #START_LICENSE###########################################################
3 #
4 # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.
5 # email: jhcepas@gmail.com
6 #
7 # This file is part of the Environment for Tree Exploration program (ETE).
8 # http://ete.cgenomics.org
9 #
10 # ETE is free software: you can redistribute it and/or modify it
11 # under the terms of the GNU General Public License as published by
12 # the Free Software Foundation, either version 3 of the License, or
13 # (at your option) any later version.
14 #
15 # ETE is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License
21 # along with ETE. If not, see <http://www.gnu.org/licenses/>.
22 #
23 # #END_LICENSE#############################################################
24
25 import re
26 import os
27
28 __all__ = ["read_newick", "write_newick", "print_supported_formats"]
29
# Regular expressions used for reading newick format.
#
# Backslashes that are meant for the regex engine are written explicitly
# (raw strings, or doubled) so the literals no longer depend on Python
# passing unknown escapes such as ``\[`` or ``\d`` through unchanged.
# The resulting string values are identical to the previous ones.
_ILEGAL_NEWICK_CHARS = ":;(),\\[\\]\t\n\r="  # spliced into a [...] class
_NHX_RE = r"\[&&NHX:[^\]]*\]"
_FLOAT_RE = r"[+-]?\d+\.?\d*"
_NAME_RE = r"[^():,;\[\]]+"

DEFAULT_DIST = 1.0
DEFAULT_NAME = ''
DEFAULT_SUPPORT = 1.0
39
40
# Allowed formats. This table is used to read and write newick using
# different conventions. You can also add your own formats in an easy way.
#
#
# FORMAT: [[LeafAttr1, LeafAttr1Type, Flexible?], [LeafAttr2, LeafAttr2Type, Flexible?],\
#    [InternalAttr1, InternalAttr1Type, Flexible?], [InternalAttr2, InternalAttr2Type, Flexible?]]
#
# The third item of every entry is consumed by the reader as a "flexible"
# flag: when True the corresponding field is optional in the input, when
# False the field must be present for the node to be accepted.
#
# Attributes are placed in the newick as follows:
#
# .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ...
#
#
#           /-A
# -NoName--|
#          |          /-B
#           \C-------|
#                    |          /-D
#                     \E-------|
#                               \-G
#
# Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
# Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
# Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
# Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
# Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));
# Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);
# Format 6 = (A,(B,(D,G):0.642905):0.567737);
# Format 7 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);
# Format 8 = (A,(B,(D,G)E)C);
# Format 9 = (A,(B,(D,G)));
# Format 100 = (,(,(,)));

NW_FORMAT = {
  0: [['name', str, True],  ["dist", float, True],  ['support', float, True],  ["dist", float, True]],   # Flexible with support
  1: [['name', str, True],  ["dist", float, True],  ['name', str, True],       ["dist", float, True]],   # Flexible with internal node names
  2: [['name', str, False], ["dist", float, False], ['support', float, False], ["dist", float, False]],  # Strict with support values
  3: [['name', str, False], ["dist", float, False], ['name', str, False],      ["dist", float, False]],  # Strict with internal node names
  4: [['name', str, False], ["dist", float, False], [None, None, False],       [None, None, False]],     # Leaf names and leaf distances only
  5: [['name', str, False], ["dist", float, False], [None, None, False],       ["dist", float, False]],  # Leaf names, all distances
  6: [['name', str, False], [None, None, False],    [None, None, False],       ["dist", float, False]],  # Leaf names, internal distances
  7: [['name', str, False], ["dist", float, False], ["name", str, False],      [None, None, False]],     # All names, leaf distances
  8: [['name', str, False], [None, None, False],    ["name", str, False],      [None, None, False]],     # All names, no distances
  9: [['name', str, False], [None, None, False],    [None, None, False],       [None, None, False]],     # Only topology with node names
  100: [[None, None, False], [None, None, False],   [None, None, False],       [None, None, False]]      # Only Topology
}
85
86
def format_node(node, node_type, format):
    """Return the newick text describing one node under a given format.

    The two attribute slots that apply to the node (leaf slots or
    internal slots of ``NW_FORMAT[format]``) are rendered as
    ``<first>[:<second>]``:

    * a ``str`` slot is written with every newick-reserved character
      replaced by ``_``;
    * a ``None`` slot is written as the empty string;
    * any other slot is converted and written with six decimals.

    A slot whose value cannot be rendered is written as ``?``.

    :argument node: object holding the attributes named in the format.
    :argument node_type: ``"leaf"`` selects the leaf slots; any other
      value selects the internal-node slots.
    :argument format: key of ``NW_FORMAT``.

    :returns: the formatted string (no surrounding parentheses or commas).
    """
    # NOTE(review): the ``def`` line is absent from this listing; the
    # signature is reconstructed from the call
    # ``format_node(node, "leaf", format)`` -- confirm against the original.
    if node_type == "leaf":
        first_slot, second_slot = NW_FORMAT[format][0], NW_FORMAT[format][1]
    else:
        first_slot, second_slot = NW_FORMAT[format][2], NW_FORMAT[format][3]
    container1, converterFn1 = first_slot[0], first_slot[1]
    container2, converterFn2 = second_slot[0], second_slot[1]

    illegal_chars = "[" + _ILEGAL_NEWICK_CHARS + "]"

    if converterFn1 == str:
        try:
            FIRST_PART = re.sub(illegal_chars, "_",
                                str(getattr(node, container1)))
        except (AttributeError, TypeError):
            FIRST_PART = "?"
    elif converterFn1 is None:
        FIRST_PART = ""
    else:
        try:
            # The first slot must use its own converter; it previously
            # borrowed the second slot's, which breaks (or yields "?")
            # whenever the two slots differ.
            FIRST_PART = "%0.6f" % (converterFn1(getattr(node, container1)))
        except (ValueError, TypeError):
            FIRST_PART = "?"

    if converterFn2 == str:
        try:
            SECOND_PART = ":" + re.sub(illegal_chars, "_",
                                       str(getattr(node, container2)))
        except (AttributeError, ValueError, TypeError):
            # AttributeError is handled here exactly as for the first slot.
            SECOND_PART = ":?"
    elif converterFn2 is None:
        SECOND_PART = ""
    else:
        try:
            SECOND_PART = ":%0.6f" % (converterFn2(getattr(node, container2)))
        except (ValueError, TypeError):
            SECOND_PART = ":?"

    return "%s%s" % (FIRST_PART, SECOND_PART)
130
131 # Used to write into specific formats
# NOTE(review): the ``def`` line of this function (listing line 132) is
# missing from this extract, so its name and signature are not visible.
# The body reads the names ``node`` and ``format``.
#
# Node name with every newick-reserved character replaced by "_".
133 safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
134 str(getattr(node, "name")))
135
# Hard-coded per-format output:
#   formats 0-3  -> "name:dist"
#   formats 4, 7 -> ":dist"
#   formats 5, 6 -> "name"
# Any other format value matches no branch, so no explicit return runs.
# NOTE(review): these mappings are independent of, and do not agree with,
# the NW_FORMAT table (e.g. NW_FORMAT[4] lists a leaf name, this emits
# only ":dist" for format 4) -- confirm which one is authoritative.
136 if format == 0 or format == 1 or format == 2 or format ==3:
137 return "%s:%0.6f" %(safe_name, node.dist)
138 elif format == 4 or format == 7:
139 return ":%0.6f" %(node.dist)
140 elif format == 5 or format == 6:
141 return "%s" %(safe_name)
142
# NOTE(review): the ``def`` line of this function (listing line 143) is
# missing from this extract, so its name and signature are not visible.
# The body reads the names ``node`` and ``format``.
#
# Node name with every newick-reserved character replaced by "_".
144 safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
145 str(getattr(node, "name")))
# Hard-coded per-format output:
#   formats 0, 1 -> "support:dist"
#   format  2    -> "name:dist"
#   formats 3, 4 -> ":dist"
#   format  5    -> "name"
#   formats 6, 7 -> "" (nothing written)
# Any other format value matches no branch, so no explicit return runs.
# NOTE(review): these mappings do not follow the NW_FORMAT table (there,
# format 1 uses internal names and format 2 uses support) -- confirm
# whether this code is still in use.
146 if format == 0 or format == 1:
147 return "%0.6f:%0.6f" %(node.support, node.dist)
148 elif format == 2:
149 return "%s:%0.6f" %(safe_name, node.dist)
150 elif format == 3 or format == 4:
151 return ":%0.6f" %(node.dist)
152 elif format == 5:
153 return "%s" %(safe_name)
154 elif format == 6 or format == 7:
155 return ""
156
def print_supported_formats():
    """Print a sample tree followed by its newick string in every format.

    A four-leaf tree is generated with ``TreeNode.populate`` and then
    written once per key of ``NW_FORMAT``, without NHX features.
    """
    # NOTE(review): the ``def`` line is absent from this listing; the name
    # comes from ``__all__`` and the body uses no arguments -- confirm.
    from ete2.coretype.tree import TreeNode
    t = TreeNode()
    t.populate(4, "ABCDEFGHI")
    # Single-argument print() produces the same output on Python 2 and 3.
    print(t)
    # Sorted so the formats are listed in numeric order rather than in
    # arbitrary dictionary order.
    for f in sorted(NW_FORMAT):
        print("Format %s = %s" % (f, write_newick(t, features=None, format=f)))
164
168
def read_newick(newick, root_node=None, format=0):
    """ Reads a newick tree from either a string or a file, and returns
    an ETE tree structure.

    A previously existing node object can be passed as the root of the
    tree, which means that all its new children will belong to the same
    class as the root (this allows working with custom TreeNode
    objects).

    You can also take advantage of this behaviour to concatenate
    several tree structures.

    :argument newick: path of a file containing a newick tree, or the
      newick string itself.
    :argument root_node: node to populate; a new ``TreeNode`` is created
      when it is None.
    :argument format: key of ``NW_FORMAT`` describing the newick flavour.

    :returns: the root node of the parsed tree.
    :raises NewickError: if ``newick`` is not a string, or if the text
      does not start with ``(`` and end with ``;``.
    """
    # NOTE(review): the ``def`` line is absent from this listing. The
    # parameter names come from the body; the default of ``format`` is not
    # visible here and 0 is assumed -- confirm against the original.
    if root_node is None:
        from ete2.coretype.tree import TreeNode
        root_node = TreeNode()

    if not isinstance(newick, str):
        raise NewickError(
            "'newick' argument must be either a filename or a newick string.")

    if os.path.exists(newick):
        # The handle is closed deterministically instead of being left to
        # the garbage collector. Plain 'r' is enough: every "\r" and "\n"
        # is removed from the text before it is parsed.
        with open(newick, 'r') as nw_file:
            nw = nw_file.read()
    else:
        nw = newick

    nw = nw.strip()
    if not nw.startswith('(') or not nw.endswith(';'):
        raise NewickError(
            'Unexisting tree file or Malformed newick tree structure.')
    return _read_newick_from_string(nw, root_node, format)
200
def _read_newick_from_string(nw, root_node, format):
    """ Reads a newick string in the New Hampshire format.

    :argument nw: newick text.
    :argument root_node: node that becomes the root of the parsed tree.
    :argument format: key of ``NW_FORMAT``, passed on to the node reader.

    :returns: ``root_node``, populated with the parsed structure.
    :raises NewickError: if opening and closing parentheses do not
      balance (node-level errors come from ``_read_node_data``).
    """
    # NOTE(review): the ``def`` line is absent from this listing; the
    # signature is reconstructed from the call
    # ``_read_newick_from_string(nw, root_node, format)`` -- confirm.
    if nw.count('(') != nw.count(')'):
        raise NewickError('Parentheses do not match. Broken tree structure')

    # Line breaks and tabs are removed; they carry no meaning here.
    for separator in ("\n", "\r", "\t"):
        nw = nw.replace(separator, "")

    current_parent = None

    # The structure is split on open parentheses. Each resulting chunk
    # represents an internal node, so for each chunk a new node is
    # created hanging from the current parent. A chunk may also contain
    # the terminal nodes hanging from that internal node and the closing
    # parentheses that close previously opened internal nodes.

    # Skip the first chunk. It is always == ''
    for internal_node in nw.split("(")[1:]:
        # If this is the root of the tree, use root_node instead of
        # creating a node; otherwise make a new one.
        if current_parent is None:
            current_parent = root_node
        else:
            current_parent = current_parent.add_child()

        # Only leaf nodes can be found within this chunk, since the rest
        # of the internal nodes are in the next chunks.
        possible_leaves = internal_node.split(",")
        last_index = len(possible_leaves) - 1
        for i, leaf in enumerate(possible_leaves):
            # Every comma-separated sub-chunk is (topologically) a child
            # of the current parent. A sub-chunk is discarded only when
            # it is empty and in the last position, meaning that the next
            # sibling is not a terminal but an internal node (it will be
            # visited in the next chunk).
            if leaf.strip() == '' and i == last_index:
                continue

            # Leaf text may end with a variable number of closing
            # parentheses. For each ')' the data of the current node is
            # read, the node is closed and we move up one level.
            clossing_nodes = leaf.split(")")
            # The first part contains the leaf data.
            _read_node_data(clossing_nodes[0], current_parent, "leaf", format)
            # The remaining parts describe the internal nodes being
            # closed (the loop is a no-op when there are none).
            for closing_internal in clossing_nodes[1:]:
                if closing_internal.strip() == ";":
                    continue
                _read_node_data(closing_internal, current_parent,
                                "internal", format)
                current_parent = current_parent.up
    return root_node
261
def _parse_extra_features(node, NHX_string):
    """ Reads node's extra data from its NHX string. NHX uses this
    format:  [&&NHX:prop1=value1:prop2=value2]

    Every ``name=value`` pair is stored through ``node.add_feature``;
    values are kept as strings.

    :raises ValueError: if a field does not consist of exactly one name
      and one value separated by ``=``.
    """
    # NOTE(review): the ``def`` line is absent from this listing; the
    # signature is reconstructed from the call
    # ``_parse_extra_features(node, data[2])`` -- confirm.
    NHX_string = NHX_string.replace("[&&NHX:", "")
    NHX_string = NHX_string.replace("]", "")
    for field in NHX_string.split(":"):
        try:
            pname, pvalue = field.split("=")
        except ValueError:
            # The offending field is reported in the exception itself
            # rather than printed to stdout before re-raising.
            raise ValueError(
                "Malformed NHX field %r in %r (expected name=value)"
                % (field, NHX_string))
        node.add_feature(pname, pvalue)
274
def _read_node_data(subnw, current_node, node_type, format):
    """ Reads the data of one node from a subpart of the original
    newick tree.

    For ``node_type == "leaf"`` a new child is added to ``current_node``
    and filled in; for any other value ``current_node`` itself receives
    the data. The two attribute slots of ``NW_FORMAT[format]`` that apply
    to the node type decide what is expected in the text, optionally
    followed by an NHX block.

    :raises NewickError: if ``subnw`` does not match the pattern built
      for the format, or if the format uses an unsupported converter.
    """
    # NOTE(review): the ``def`` line is absent from this listing; the
    # signature is reconstructed from the call
    # ``_read_node_data(text, current_parent, "leaf", format)`` -- confirm.
    if node_type == "leaf":
        node = current_node.add_child()
        first_slot, second_slot = NW_FORMAT[format][0], NW_FORMAT[format][1]
    else:
        node = current_node
        first_slot, second_slot = NW_FORMAT[format][2], NW_FORMAT[format][3]
    container1, converterFn1, flexible1 = \
        first_slot[0], first_slot[1], first_slot[2]
    container2, converterFn2, flexible2 = \
        second_slot[0], second_slot[1], second_slot[2]

    if converterFn1 == str:
        FIRST_MATCH = "(" + _NAME_RE + ")"
    elif converterFn1 == float:
        FIRST_MATCH = "(" + _FLOAT_RE + ")"
    elif converterFn1 is None:
        FIRST_MATCH = '()'
    else:
        # Previously this fell through and failed later with an unbound
        # local variable.
        raise NewickError("Unsupported attribute type in newick format %r"
                          % (format,))

    # The second slot carries its ':' separator inside the group.
    if converterFn2 == str:
        SECOND_MATCH = "(:" + _NAME_RE + ")"
    elif converterFn2 == float:
        SECOND_MATCH = "(:" + _FLOAT_RE + ")"
    elif converterFn2 is None:
        SECOND_MATCH = '()'
    else:
        raise NewickError("Unsupported attribute type in newick format %r"
                          % (format,))

    # A flexible slot is made optional in the pattern.
    if flexible1:
        FIRST_MATCH += "?"
    if flexible2:
        SECOND_MATCH += "?"

    MATCH = r'%s\s*%s\s*(%s)?' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE)
    data = re.match(MATCH, subnw)
    if not data:
        raise NewickError("Unexpected leaf node format:\n\t" + subnw[0:50])

    data = data.groups()
    if data[0] is not None and data[0] != '':
        node.add_feature(container1, converterFn1(data[0].strip()))

    if data[1] is not None and data[1] != '':
        # [1:] drops the leading ':' captured with the second slot.
        node.add_feature(container2, converterFn2(data[1][1:].strip()))

    if data[2] is not None and data[2].startswith("[&&NHX"):
        _parse_extra_features(node, data[2])
    return
331
def write_newick(node, features=[], format=1, _is_root=True):
    """ Recursively reads a tree structure and returns its NHX
    representation.

    :argument node: node to serialise together with its descendants.
    :argument features: handed unchanged to ``_get_features_string`` for
      every node, where None selects no NHX features and an empty list
      selects all features of the node.
    :argument format: key of ``NW_FORMAT``.
    :argument _is_root: internal flag; only the outermost call appends
      the terminating ``;``.

    :returns: the newick string.
    """
    # NOTE(review): the ``def`` line is absent from this listing.
    # Parameter names and order come from the calls in this file, and
    # ``_is_root`` must default to True because the top-level call omits
    # it. The defaults of ``features`` and ``format`` are NOT visible here
    # and are assumptions -- confirm against the original. The ``[]``
    # default is only ever compared, never mutated.
    if not node.children:
        # A childless node is written as a leaf and, as before, gets no
        # terminating ';' even on the outermost call.
        return (format_node(node, "leaf", format)
                + _get_features_string(node, features))

    children_nw = [write_newick(cnode, features, format=format,
                                _is_root=False)
                   for cnode in node.children]
    newick = "(" + ",".join(children_nw) + ")"
    # The root (no parent) carries no internal-node data.
    if node.up is not None:
        newick += format_node(node, "internal", format)
        newick += _get_features_string(node, features)
    if _is_root:
        newick += ";"
    return newick
360
361
def _get_features_string(self, features=[]):
    """ Generates the extended newick string NHX with extra data about
    a node.

    :argument self: node whose attributes are written.
    :argument features: names of the attributes to write. None writes
      nothing; an empty list writes every name in ``self.features``.
      Names the node does not have are skipped.

    :returns: ``[&&NHX:name=value:...]``, or an empty string when no
      attribute is written. Newick-reserved characters in values are
      replaced by ``_``.
    """
    # NOTE(review): the ``def`` line is absent from this listing. The
    # parameter names come from the body; the default of ``features`` is
    # not visible here and is an assumption -- confirm. The ``[]`` default
    # is only ever compared, never mutated.
    if features is None:
        features = []
    elif features == []:
        features = self.features

    fields = []
    for pr in features:
        if hasattr(self, pr):
            value = re.sub("[" + _ILEGAL_NEWICK_CHARS + "]", "_",
                           str(getattr(self, pr)))
            fields.append("%s=%s" % (pr, value))

    if not fields:
        return ""
    return "[&&NHX:" + ":".join(fields) + "]"
382
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Fri Jul 2 11:17:27 2010 | http://epydoc.sourceforge.net |