Home

html_matcher.py

Download
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
Context-independent xHTML pair matcher
Use method <code>match(html, start_ix)</code> to find matching pair.
If pair was found, this function returns a list of indexes where tag pair 
starts and ends. If pair wasn't found, <code>None</code> will be returned.

The last matched (or unmatched) result is saved in <code>last_match</code> 
dictionary for later use.

@author: Sergey Chikuyonok (serge.che@gmail.com)
'''
import re

start_tag = r'<([w:-]+)((?:s+[w-:]+(?:s*=s*(?:(?:"[^"]*")|(?:'[^']*')|[^>s]+))?)*)s*(/?)>'
end_tag = r'</([w:-]+)[^>]*>'
attr = r'([w-:]+)(?:s*=s*(?:(?:"((?:\.|[^"])*)")|(?:'((?:\.|[^'])*)')|([^>s]+)))?'

"Last matched HTML pair"
last_match = {
	'opening_tag': None, # Tag() or Comment() object
	'closing_tag': None, # Tag() or Comment() object
	'start_ix': -1,
	'end_ix': -1
}

cur_mode = 'xhtml'
"Current matching mode"

def set_mode(new_mode):
	global cur_mode
	if new_mode != 'html': new_mode = 'xhtml'
	cur_mode = new_mode

def make_map(elems):
	"""
	Create dictionary of elements for faster searching
	@param elems: Elements, separated by comma
	@type elems: str
	"""
	obj = {}
	for elem in elems.split(','):
			obj[elem] = True

	return obj

# Empty Elements - HTML 4.01
empty = make_map("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

# Block Elements - HTML 4.01
block = make_map("address,applet,blockquote,button,center,dd,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

# Inline Elements - HTML 4.01
inline = make_map("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

# Elements that you can, intentionally, leave open
# (and which close themselves)
close_self = make_map("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");

# Attributes that have their values filled in disabled="disabled"
fill_attrs = make_map("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

#Special Elements (can contain anything)
# serge.che: parsing data inside <scipt> elements is a "feature"
special = make_map("style");

class Tag():
	"""Matched tag"""
	def __init__(self, match, ix):
		"""
		@type match: MatchObject
		@param match: Matched HTML tag
		@type ix: int
		@param ix: Tag's position
		"""
		global cur_mode
		
		name = match.group(1).lower()
		self.name = name
		self.full_tag = match.group(0)
		self.start = ix
		self.end = ix + len(self.full_tag)
		self.unary = ( len(match.groups()) > 2 and bool(match.group(3)) ) or (name in empty and cur_mode == 'html')
		self.type = 'tag'
		self.close_self = (name in close_self and cur_mode == 'html')

class Comment():
	"Matched comment"
	def __init__(self, start, end):
		self.start = start
		self.end = end
		self.type = 'comment'

def make_range(opening_tag=None, closing_tag=None, ix=0):
	"""
	Makes selection ranges for matched tag pair
	@type opening_tag: Tag
    @type closing_tag: Tag
    @type ix: int
    @return list
	"""
	start_ix, end_ix = -1, -1
	
	if opening_tag and not closing_tag: # unary element
		start_ix = opening_tag.start
		end_ix = opening_tag.end
	elif opening_tag and closing_tag: # complete element
		if (opening_tag.start < ix and opening_tag.end > ix) or (closing_tag.start <= ix and closing_tag.end > ix):
			start_ix = opening_tag.start
			end_ix = closing_tag.end;
		else:
			start_ix = opening_tag.end
			end_ix = closing_tag.start
	
	return start_ix, end_ix

def save_match(opening_tag=None, closing_tag=None, ix=0):
	"""
	Save matched tag for later use and return found indexes
    @type opening_tag: Tag
    @type closing_tag: Tag
    @type ix: int
    @return list
	"""
	last_match['opening_tag'] = opening_tag; 
	last_match['closing_tag'] = closing_tag;
	
	last_match['start_ix'], last_match['end_ix'] = make_range(opening_tag, closing_tag, ix)
	
	return last_match['start_ix'] != -1 and (last_match['start_ix'], last_match['end_ix']) or (None, None)

def match(html, start_ix, mode='xhtml'):
	"""
	Search for matching tags in <code>html</code>, starting from
	<code>start_ix</code> position. The result is automatically saved
	in <code>last_match</code> property
	"""
	return _find_pair(html, start_ix, mode, save_match)

def find(html, start_ix, mode='xhtml'):
	"""
	Search for matching tags in <code>html</code>, starting from
	<code>start_ix</code> position.
	"""
	return _find_pair(html, start_ix, mode)

def get_tags(html, start_ix, mode='xhtml'):
	"""
	Search for matching tags in <code>html</code>, starting from 
	<code>start_ix</code> position. The difference between 
	<code>match</code> function itself is that <code>get_tags</code> 
	method doesn't save matched result in <code>last_match</code> property 
	and returns array of opening and closing tags
	This method is generally used for lookups
	"""
	return _find_pair(html, start_ix, mode, lambda op, cl=None, ix=0: (op, cl) if op and op.type == 'tag' else None)


def _find_pair(html, start_ix, mode='xhtml', action=make_range):
	"""
	Search for matching tags in <code>html</code>, starting from
	<code>start_ix</code> position
	
	@param html: Code to search
	@type html: str
	
	@param start_ix: Character index where to start searching pair
	(commonly, current caret position)
	@type start_ix: int
	
	@param action: Function that creates selection range
	@type action: function
	
	@return: list
	"""

	forward_stack = []
	backward_stack = []
	opening_tag = None
	closing_tag = None
	html_len = len(html)
	
	set_mode(mode)

	def has_match(substr, start=None):
		if start is None:
			start = ix

		return html.find(substr, start) == start


	def find_comment_start(start_pos):
		while start_pos:
			if html[start_pos] == '<' and has_match('<!--', start_pos):
				break

			start_pos -= 1

		return start_pos

#    find opening tag
	ix = start_ix - 1
	while ix >= 0:
		ch = html[ix]
		if ch == '<':
			check_str = html[ix:]
			m = re.match(end_tag, check_str)
			if m:  # found closing tag
				tmp_tag = Tag(m, ix)
				if tmp_tag.start < start_ix and tmp_tag.end > start_ix: # direct hit on searched closing tag
					closing_tag = tmp_tag
				else:
					backward_stack.append(tmp_tag)
			else:
				m = re.match(start_tag, check_str)
				if m: # found opening tag
					tmp_tag = Tag(m, ix);
					if tmp_tag.unary:
						if tmp_tag.start < start_ix and tmp_tag.end > start_ix: # exact match
							return action(tmp_tag, None, start_ix)
					elif backward_stack and backward_stack[-1].name == tmp_tag.name:
						backward_stack.pop()
					else: # found nearest unclosed tag
						opening_tag = tmp_tag
						break
				elif check_str.startswith('<!--'): # found comment start
					end_ix = check_str.find('-->') + ix + 3;
					if ix < start_ix and end_ix >= start_ix:
						return action(Comment(ix, end_ix))
		elif ch == '-' and has_match('-->'): # found comment end
			# search left until comment start is reached
			ix = find_comment_start(ix)

		ix -= 1
		
	if not opening_tag:
		return action(None)
	
	# find closing tag
	if not closing_tag:
		ix = start_ix
		while ix < html_len:
			ch = html[ix]
			if ch == '<':
				check_str = html[ix:]
				m = re.match(start_tag, check_str)
				if m: # found opening tag
					tmp_tag = Tag(m, ix);
					if not tmp_tag.unary:
						forward_stack.append(tmp_tag)
				else:
					m = re.match(end_tag, check_str)
					if m:   #found closing tag
						tmp_tag = Tag(m, ix);
						if forward_stack and forward_stack[-1].name == tmp_tag.name:
							forward_stack.pop()
						else:  # found matched closing tag
							closing_tag = tmp_tag;
							break
					elif has_match('<!--'): # found comment
						ix += check_str.find('-->') + 3
						continue
			elif ch == '-' and has_match('-->'):
				# looks like cursor was inside comment with invalid HTML
				if not forward_stack or forward_stack[-1].type != 'comment':
					end_ix = ix + 3
					return action(Comment( find_comment_start(ix), end_ix ))
				
			ix += 1
	
	return action(opening_tag, closing_tag, start_ix)