-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathdisassembler.py
811 lines (617 loc) · 34.9 KB
/
disassembler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import (ascii, bytes, chr, dict, filter, hex, input,
int, map, next, oct, open, pow, range, round,
str, super, zip)
import re
from hexagondisasm import common
from hexagondisasm.common import HexagonInstruction, HexagonPacket
from hexagondisasm.common import InstructionOperand, InstructionRegister, InstructionImmediate
from hexagondisasm.common import ImmediateTemplate, RegisterTemplate
from hexagondisasm.common import UnexpectedException, UnknownInstructionException
from hexagondisasm.common import extract_bits, INST_SIZE
class HexagonDisassembler(object):
    """Hexagon disassembler.

    The disassembler is meant to be driven sequentially through
    ``disasm_one_inst``, one 32-bit instruction word at a time, so that the
    packet context (constant extenders, new-value producers, hardware loop
    ends) is available when each instruction is processed.

    Attributes:
        inst_templates (List[InstructionTemplate]): List of instruction templates
            generated by the decoder.
        curr_packet (HexagonPacket): Packet that contains the current instruction.
        objdump_compatible (bool): Used to produce objdump compatible syntax, to test
            the effectiveness of the disassembler against Qualcomm's objdump. Many
            times the objdump syntax is not the preferred one (e.g., when using the
            disassembler for the IDA processor module), so it can be disabled.
        segmented_inst_templates (Dict[int, List[InstructionTemplate]]): Dictionary of
            lists of instruction templates, classified by their 4 ICLASS bits,
            not including the duplex instructions. Each entry in the dict. is indexed
            by the ICLASS bits, and contains the segment of instructions belonging to
            that ICLASS.
        duplex_templates (List[InstructionTemplate]): List of duplex instructions
            templates, that are separated from the rest of the templates in
            `segmented_inst_templates`.

    """

    __slots__ = ['inst_templates', 'curr_packet', 'segmented_inst_templates',
                 'duplex_templates', 'objdump_compatible']

    # Map a (duplex) register-pair field value to the pair of register numbers
    # ``(odd, even)``. Copied from Table 10-3 of the manual, as not to make a
    # mistake (it could be reduced to a formula).
    _DUPLEX_REG_PAIR_MAP = {
        0b000: (1, 0),
        0b001: (3, 2),
        0b010: (5, 4),
        0b011: (7, 6),
        0b100: (17, 16),
        0b101: (19, 18),
        0b110: (21, 20),
        0b111: (23, 22),
    }

    # Pattern used to capture the producer register of a new-value operand from
    # the (already generated) text of the producer instruction. Compiled once at
    # class creation instead of on every new-value register operand.
    _PRODUCER_REG_RE = re.compile(r"""
        # Looking for something like: "R14 = ..."
        (           # Open a capture group for the reg. name.
        r           # The producer register is supposed to be a general
                    # purpose one (Rx). The reg. name is in lowercase (hence
                    # the use of a lower 'r'), converted by populate_syntax.
        \d{1,2}     # Register number (0-31).
        )           # End of the capture group, only care for the reg. name.
        \s*
        .?          # Used to cover for cases of compound assignment (e.g.,
                    # '+=', '&=', etc.)
        =           # The producer register has to be the target of an assignment
                    # (i.e., to the left of the '=')
        """, re.X)
    # TODO: There may be more than one assignment ('=' in the syntax), if there
    # are multiple instructions.

    def __init__(self, objdump_compatible=False):
        """Load the instruction templates and classify them for faster matching.

        Args:
            objdump_compatible (bool): If True, generate objdump compatible syntax.

        """
        self.inst_templates = common.pickle_load(common.INST_TEMPL_PATH)
        self.curr_packet = None
        self.objdump_compatible = objdump_compatible

        # Classify the (non duplex) instructions by the ICLASS bits (31:28),
        # which are always fixed to 0/1. This improves performance at the time
        # to find an instruction template match, because the search will be limited
        # to the reduced template segment indexed by these 4 bits.
        #
        # The duplex instructions go in a separate segment. First, because their
        # ICLASS bits have different positions (bits 31:29 and 13). Second, because
        # the duplex instructions require a "don't have" match for their PP (parse) bits.
        # In a normal instruction template bit matching, a certain (defined) pattern is
        # being looked for, e.g., ``0101xxx101xx1...`` (``x``: can have any value).
        # But for duplex instructions, apart from the "have certain bits" match,
        # another condition has to be met, that the PP bits are NOT set to 00.
        # This negative condition is harder to implement in the current framework,
        # therefore the duplex instructions are processed separately.
        # TODO: rewrite this explanation.
        self.segmented_inst_templates = {}
        self.duplex_templates = []

        for inst in self.inst_templates:
            if inst.is_duplex:
                self.duplex_templates.append(inst)
                # TODO: The duplex instructions can be segmented too, but I don't
                # know if their quantity merits that split.
            else:
                # The first 4 chars of the encoding text are the ICLASS bits.
                iclass = int(inst.encoding.text[0:4], 2)
                self.segmented_inst_templates.setdefault(iclass, []).append(inst)
        # TODO: Move the segmentation to the decoding phase.

    def process_constant_extender(self, hi):
        """Process (if exists) a constant extender from the previous instruction, and apply it to this one.

        If the previous instruction was a constant extender (``immext``), it has to be
        applied to one of the immediate operands of this instruction. Which one of the
        immediate operands it has to be applied to depends on the type of the instruction,
        as specified in Table 10-10.

        To avoid coding all the information of that table inside this function some
        simplifications have been applied. First, if the instruction has only one
        immediate operand, then it has to be applied to that one. Second, the
        ``HexagonInstructionDecoder``, in ``resolve_constant_extender``, takes advantage
        of the behavior of the instruction (``apply_extension`` function in the
        instruction's behavior) to infer which operand the extension applies.

        Note (from the manual): "When constant extenders are used, scaled immediates are
        not scaled by the processor. Instead, the assembler must encode the full 32-bit
        unscaled value."

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the extension is applied to the HexagonInstruction itself.

        Raises:
            UnknownInstructionException: If the previous instruction was an ``immext``
                but this instruction has no immediate operands, or has more than two.

        """
        if self.curr_packet.n_inst() < 2:
            # There has to be at least 2 instructions in the packet so far to apply a constant
            # extension, the ``immext`` and the following instruction to apply it to.
            return

        immext = self.curr_packet.get_before_last_inst().immext
        if immext is None:
            # Previous instruction was not a constant extender.
            return

        n_imm_ops = len(hi.imm_ops)

        if n_imm_ops == 0:
            raise UnknownInstructionException(
                "Previous instruction was an 'immext', but current instruction doesn't have "
                "any immediate operands to apply the extension to."
            )

        if n_imm_ops > 2:
            raise UnknownInstructionException(
                "Instruction has more than 2 immediate operands ({:d}). No instruction "
                "studied so far has been observed to have more than that, this is probably "
                "an error from the parsing/decoding stages.".format(n_imm_ops)
            )
            # Although having more than 2 imm. ops. impacts the logic of this function,
            # the check should be done prior to the disassembling stage.
            # TODO: Move this check to a more adequate function, maybe in the decoding stage.

        # The immediate operand to which the constant extension will be applied.
        if n_imm_ops == 1:
            # If there is only one immediate operand, then this is the one to be extended.
            extension_target = hi.imm_ops[0]  # type: InstructionImmediate
        elif hi.template.imm_ext_op:
            # Two imm. operands, rely on the `imm_ext_op` indicator generated by the decoder.
            extension_target = hi.get_real_operand(hi.template.imm_ext_op)
        else:
            # The decoder couldn't figure out which of the two imm. op. the
            # extension applies to. Arbitrarily, decide to apply it to the
            # first one.
            # This case shouldn't be happening, there should always be a call
            # to ``apply_extension`` in the behavior of an instruction whose imm.
            # op. can be extended.
            # TODO: Log this case if it happens.
            extension_target = hi.imm_ops[0]

        # When an immediate value is being extended, just the lower 6 bits of its original
        # value remain, the rest are taken from the constant extender (`immext`). The
        # `immext` value has already been left shifted 6 positions.
        extension_target.value = immext | extract_bits(extension_target.field_value, 5, 0)
        extension_target.is_extended = True

    def fill_in_reg_info(self, reg, hi):
        """Set the register operand value and text format.

        Dispatches to the register pair, new-value or single register handling
        according to the register template.

        Args:
            reg (InstructionRegister): Target register operand.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the data is applied to the InstructionRegister itself.

        Raises:
            UnknownInstructionException: If the register operand can't be resolved
                (invalid duplex pair field, reserved new-value distance, or missing
                producer instruction/register).

        """
        if reg.template.is_register_pair:
            self._fill_in_reg_pair_info(reg, hi)
        elif reg.template.syntax_name[0] == 'N':
            self._fill_in_new_value_reg_info(reg)
        else:
            self._fill_in_single_reg_info(reg, hi)

    def _fill_in_reg_pair_info(self, reg, hi):
        """Set the name of a register pair operand (e.g., R5:4)."""
        # From the field value determine both register numbers: odd and even.
        if hi.template.mult_inst is False:
            # TODO: It's not clear how the odd/even numbers of a register pair are specified.
            # I'm assuming that if the register field value is odd,
            # then it corresponds to the number of the first register of the pair,
            # if it's even, it's referring to the second number of the pair.
            # The order is always ``R_odd:even`` (odd > even), so the other register
            # number (that is not specified by the field value) is set accordingly
            # to respect this order.
            if reg.field_value % 2 == 0:
                odd, even = reg.field_value + 1, reg.field_value
            else:
                odd, even = reg.field_value, reg.field_value - 1
        else:
            # Duplex instruction.
            # TODO: Differentiate between duplex and mult_inst (that includes compound).
            # I think this case applies only to the duplex case, so that attribute (and
            # not `mult_inst`) should be tested in the if.
            try:
                odd, even = self._DUPLEX_REG_PAIR_MAP[reg.field_value]
            except KeyError:
                # Report a field value outside Table 10-3 as an unknown instruction
                # (handled by the caller) instead of crashing with a KeyError.
                raise UnknownInstructionException(
                    "Register pair field value ({:d}) is not a valid duplex "
                    "register pair encoding.".format(reg.field_value)
                )

        if self.objdump_compatible:
            pair_text = "{:d}:{:d}".format(odd, even)
        else:
            # Prefer full register names: "r7:r6" (instead of "r7:6"), to take advantage
            # of the IDA text highlighting feature, to easily spot register references.
            pair_text = "{:d}:{:s}{:d}".format(odd, reg.template.syntax_name[0], even)

        # The pair placeholder is the field char repeated twice (e.g., 'dd' in 'Rdd').
        reg.name = reg.template.syntax_name.replace(reg.field_char * 2, pair_text)

    def _fill_in_new_value_reg_info(self, reg):
        """Set the name of a new-value register operand (e.g., 'Nt.new' -> 'R14.new')."""
        # From the manual, 10.11 New-value operands: "Instructions that include a new-value
        # register operand specify in their encodings which instruction in the
        # packet has its destination register accessed as the new-value register."
        #
        # In the manual it mentions without a clear definition the terms consumer
        # and producer. I understand the term "producer" as the destination register
        # in a instruction with an assignment (a register to the left of '=').
        #
        # From the manual:
        #     Nt[2:1] encodes the distance (in instructions) from the producer to
        #     the consumer, as follows:
        #         Nt[2:1] = 00    // reserved
        #         Nt[2:1] = 01    // producer is +1 instruction ahead of consumer
        #         Nt[2:1] = 10    // producer is +2 instructions ahead of consumer
        #         Nt[2:1] = 11    // producer is +3 instructions ahead of consumer
        producer_distance = extract_bits(reg.field_value, 2, 1)  # type: int

        if producer_distance == 0:
            raise UnknownInstructionException(
                "New-value operands with a (invalid) consumer distance of 0 (reserved value)"
            )

        # From the current consumer ('Nt') register, try to find the producer,
        # that is 1-3 instructions behind (in the same packet), "not counting
        # empty slots or constant extenders" (from the manual).
        #
        # I'm not sure what an "empty slot" is, besides maybe a nop, but real
        # cases show that nop is taken into account in the distance, and the
        # only thing that is ignored are constant extenders.
        producer_inst = None  # type: HexagonInstruction
        distance_walked = 0

        # Walk the packet in reverse order, from the instruction previous to the
        # current one (which contains the consumer register), to the first one.
        # TODO: avoid direct access to 'self.curr_packet.instructions'.
        for packet_inst in reversed(self.curr_packet.instructions[0:-1]):
            if packet_inst.immext is None:
                # Not a constant extender instruction, applies to the distance count.
                distance_walked += 1

            if distance_walked == producer_distance:
                producer_inst = packet_inst
                break

        if producer_inst is None:
            # It may happen that the disassembler is called for random instruction (i.e.,
            # not in sequential address order), and I don't have the previous instructions
            # of the packet to find the producer.
            # TODO: Is there a better way to handle it than to raise an exception?
            raise UnknownInstructionException(
                "New-value register operand with a producer distance of {:d} "
                "doesn't correspond to a producer instruction.".format(producer_distance)
            )

        # The instruction with the producer register has been found, now capture the
        # name of the producer register inside that instruction.
        m = self._PRODUCER_REG_RE.search(producer_inst.text)

        if m is None:
            raise UnknownInstructionException(
                "New-value operand with a producer instruction that is not producing "
                "a new register operand. The pattern 'Rx = ...' was not found.")

        # Replace the consumer register placeholder 'Nt.new' with the name of the actual
        # producer register, e.g., 'R14', resulting in the reg. name: 'R14.new'.
        reg.name = reg.template.syntax_name.replace('N' + reg.field_char, m.group(1))

    def _fill_in_single_reg_info(self, reg, hi):
        """Set the name of a single (not new-value) register operand."""
        reg_number = reg.field_value

        if hi.template.mult_inst:
            # TODO: Check and replace `mult_inst` with `is_duplex`. Those are two different
            # checks (even though it is working like this for unknown reasons).
            # Instruction duplex. Table 10-3: single register case. Field values from 0-7
            # match exactly to reg. numbers 0-7. Field values from 8-15, on the other hand,
            # match a consecutive number range of 16-23, which is the field value plus 8.
            if reg_number > 7:
                reg_number += 8

        reg.name = reg.template.syntax_name.replace(reg.field_char, str(reg_number))

    def fill_in_imm_info(self, imm, hi):
        """Set the immediate operand value (except for constant extensions) and text format.

        Set the immediate operand value and text format according to the operand type.
        The constant extension has to be performed prior to this function, calling
        ``process_constant_extender``.

        Args:
            imm (InstructionImmediate): Target immediate operand.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the data is applied to the InstructionImmediate itself.

        Raises:
            UnknownInstructionException: If a branch target (type 'r' operand) results
                in a negative address.

        TODOs:
            * Handle the case of resulting negative values from constant-extended imm. ops.

        """
        if imm.is_extended is False:
            # Immediate operand was not extended, apply value (with sign) and scaling.
            imm.value = imm.field_value

            if imm.template.type in ('r', 'm', 's'):
                # The immediate operand type is signed.
                bit_len = hi.template.encoding.fields[imm.field_char].mask_len
                # TODO: Find a shorter way to get the op. mask len, and avoid using
                # HexagonInstruction directly, it's the only reason the `hi` argument
                # was added to this function.
                imm.value = common.get_signed_value(imm.value, bit_len)

            imm.value <<= imm.template.scaled

        if self.objdump_compatible:
            if imm.is_extended:
                imm.print_format = '##{:d}'
            else:
                imm.print_format = '#{:d}'
        else:
            # I prefer hexadecimal values, don't care about the double hash.
            imm.print_format = '#{:X}'

        if imm.template.type == 'm':
            # Special case: Modifier registers.
            imm.value += 1
            # TODO: The min-max range for this type in Table 1-3 doesn't seem to add up.

        if imm.template.type == 'r':
            # Special case: imm. operand used as a target by jump/call,
            # it's relative to PC (added to the packet address)
            # and usually printed in hex without the '#'.
            imm.value &= ~0x3
            imm.value += hi.packet.address

            if imm.value < 0:
                raise UnknownInstructionException(
                    "Branch target (taken from an imm. op. of type 'r') "
                    "resulted in a negative value: {:x}".format(imm.value)
                )

            if self.objdump_compatible:
                # Addresses are printed in hexadecimal with the 0x prefix and without
                # the '#' (this format was only observed for branch targets).
                imm.print_format = "0x{:x}"
            else:
                imm.print_format = "{:x}"

    def generate_inst_text(self, hi):
        """Get the instruction text output.

        The text returned does not include the packet delimiters (``{}``) nor the
        ``endloop`` tags, those are added by ``disasm_one_inst``.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            str: instruction text.

        Raises:
            UnknownInstructionException: If the instruction is unknown and a text can't be produced.

        TODOs:
            * Change function name to something like ``get/extract inst_text (_output)``. It's not
                exactly ``get_inst_text``, because the text is not completely defined here, the
                start/end packet ``{}`` and the ``endloop`` tags have to be added. So what would
                be the correct terminology for the string inside the packet ``{}``. Split between
                full instruction text, and the "inner" instruction text.

        """
        if hi.template is None and hi.immext is None:
            # It's neither a recognized instruction (from a known template) nor a
            # constant extender.
            # TODO: Elaborate on this, why this two attributes have to be none?
            # TODO: Move this raise to the caller? although the UnknownInstruction
            # pattern works correctly.
            raise UnknownInstructionException('Instruction not recognized.')

        if hi.immext is not None:
            # Constant extender.
            if self.objdump_compatible:
                return 'immext (#{:d})'.format(hi.immext)
            # I don't care about the extension value, the final value will be
            # shown in the next instruction.
            return 'immext'

        inst_text = hi.template.syntax

        # Get the immediate values and register names, and replace them in the
        # instruction syntax.
        self.process_constant_extender(hi)

        for imm in hi.imm_ops:
            self.fill_in_imm_info(imm, hi)
            inst_text = inst_text.replace(imm.template.syntax_name, repr(imm))
            # E.g., 'Rd = add(Rs, #s16)' -> 'Rd = add(Rs, 2BF4)'

        for reg in hi.reg_ops:
            self.fill_in_reg_info(reg, hi)
            inst_text = inst_text.replace(reg.template.syntax_name, reg.name)
            # E.g., 'Rd = add(Rs, 2BF4)' -> 'R8 = add(Rs, 2BF4)' (first iteration)
            #       'R8 = add(Rs, 2BF4)' -> 'R8 = add(R17, 2BF4)' (second iteration)

        # Like objdump.
        # TODO: Check objdump_compatible and only lower it on that case? Not lowering reg.
        # names will need adjusting in some regex and str. manipulation. For simplicity it
        # can be left like this (no check, always lower text).
        return inst_text.lower()

    def generate_instruction_operands(self, inst, hi):
        """Generate the instruction operands from the template operands.

        Args:
            inst (int): Actual instruction value.
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the generated operands are stored in `HexagonInstruction.imm_ops` and `reg_ops`.

        Raises:
            UnexpectedException: If for some error there is a different type of template operand
                than the register or the immediate.

        TODOs:
            * Remove `inst` argument, maybe move it as an attribute of the HexagonInstruction.

        """
        fields = hi.template.encoding.fields
        template_operands = hi.template.operands

        for c in fields:
            if c == 'N':
                # TODO: Handle 'N' field char.
                continue

            template_op = template_operands[c]

            # Instruction operand being generated (from the template operand).
            if isinstance(template_op, ImmediateTemplate):
                inst_op = InstructionImmediate()  # type: InstructionOperand
                hi.imm_ops.append(inst_op)
            elif isinstance(template_op, RegisterTemplate):
                inst_op = InstructionRegister()
                hi.reg_ops.append(inst_op)
            else:
                raise UnexpectedException("Unknown operand type.")

            inst_op.field_char = c
            inst_op.field_value = self.extract_and_join_mask_bits(inst, fields[c])
            inst_op.template = template_op
            # The instruction operand has a "pointer" to the template operand from which
            # it was created. It's redundant, but allows a more cleaner access than going
            # through HexagonInstruction.
            # TODO: Move this comment to the docstring of the InstructionOperand class.

    def process_endloops(self, hi):
        """Process (if exists) a hardware loop end.

        Checks if this instruction signals the end of a hardware loop (e.g.,
        ``endloop0``). The presence of the ``endloop`` signal is indicated in the
        HexagonInstruction by saving the number of the loop being ended (0 or 1) in
        its `endloop` attribute.

        The function ``process_packet_info`` needs to be called before this function,
        as it is needed to know if this instruction is the last one of the packet.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.

        Returns:
            None: the processing is done inside the `endloop` attribute.

        TODOs:
            * endloop1 analysis.

        """
        if not hi.end_packet:
            # Only the last instruction of the packet can signal the end of the loop.
            return

        # Check for "Last in loop 0" (``endloop0``).
        if self.curr_packet.n_inst() < 2:
            # "The last packet in a hardware loop 0 must contain two or
            # more instructions." (From the manual.)
            return

        if self.curr_packet.get_inst(0).parse_bits != 0b10:
            # Parse Field in First Instruction has to be 10 (Table 10-7).
            return

        # Parse Field in Second Instruction: 01 or 11 (Table 10-7).
        #
        # The table and the examples don't seem to add up. The examples
        # where both loops end, i.e., ``:endloop0:endloop1``, have parse bits
        # values of 10 in the second instruction, which would violate the
        # rule in Table 10-7, which indicates that for ``endloop0`` only 01 or 11
        # are allowed in the second instruction. For now the 10 case is also
        # accepted, as an exception.
        # TODO: Check real examples.
        if self.curr_packet.get_inst(1).parse_bits in (0b01, 0b11, 0b10):
            hi.endloop.append(0)

    def process_packet_info(self, hi, inst):
        """Process packet information.

        Keeping track of all the instructions in the packet is necessary as many
        instructions depend on previous ones (e.g., constant extenders), and this
        dependency is only limited to the packet: all the information needed to
        correctly disassemble the instructions is in the packet itself.

        The disassembler is designed to be used in sequential mode, disassembling
        all the instructions in the same packet one after the other. A single instruction
        can't be correctly analyzed outside that scope (although IDA analysis sometimes
        does that).

        During a packet disassembly, if an instruction from a different packet is
        disassembled (calling `disasm_one_inst`) all the current packet information
        is lost. All the instructions of a single packet have to be disassembled in
        continuous order.

        Args:
            hi (HexagonInstruction): Current instruction being disassembled.
            inst (int): Actual instruction value.

        Returns:
            None: the `start_packet`, `end_packet`, `parse_bits` and `packet`
                attributes of `hi` are set, and `curr_packet` is updated.

        TODOs:
            * Review and move part of this docstring to the project documentation.
            * Remove the `inst` argument once it is added to the HexagonInstruction class.

        """
        # Check if a new packet is being disassembled, either because:
        #   1. This is the first ever instruction being disassembled (i.e.,
        #      ``curr_packet`` is None).
        #   2. The previous (contiguous) instruction was the end of its packet,
        #      therefore this instruction has to start a new one.
        #   3. The previous disassembled instruction is not contiguous (an address
        #      that is not 4 bytes back), so it has to be assumed (for lack of any
        #      other information) that a new packet is being disassembled. There
        #      is no way to know for sure that this instruction is indeed the first one
        #      in the packet (the parse bits only indicate the last, but not the
        #      first instruction), so it's the safest bet (assuming the disassembler
        #      is being correctly used a jump to the middle of the packet is not allowed).
        if self.curr_packet is None:
            # Case 1.
            hi.start_packet = True
        else:
            last_inst = self.curr_packet.get_last_inst()
            if hi.addr - INST_SIZE == last_inst.addr:
                # There's a continuity in the disassembler use.
                if last_inst.end_packet:
                    # Case 2.
                    hi.start_packet = True
                else:
                    # The current packet continues with this instruction.
                    hi.start_packet = False
            else:
                # Case 3.
                hi.start_packet = True

        if hi.start_packet:
            # If it is the first instruction in the packet it has to be new one.
            self.curr_packet = HexagonPacket(hi)
        else:
            # This instruction continues the current packet so it's added to the list.
            self.curr_packet.add_next_inst(hi)

        hi.packet = self.curr_packet
        # TODO: Maybe there's some overlapping here and I don't need `self.curr_packet`.

        # Check if this instruction is the end of the packet, which is indicated by
        # the PP (parse) bits if their value is:
        #   1. '11' for a normal instruction, signals packet end.
        #   2. '00' signals a duplex instruction, and from the manual: "The duplex
        #      must always appear as the last word in a packet."
        hi.parse_bits = extract_bits(inst, 15, 14)
        hi.end_packet = hi.parse_bits in (0b00, 0b11)
        # TODO: Perform two different checks. The normal PP == 11, and `hi.is_duplex` in
        # another if (`is_duplex` has to be set first, which is not happening now).

    def extract_and_join_mask_bits(self, inst, encoding_field):
        """Extract a field value from an instruction, based on the field encoding.

        Args:
            inst (int): Actual instruction value from which the field value is extracted.
            encoding_field (EncodingField): field whose value will be extracted.

        Returns:
            int: extracted field value.

        TODOs:
            * Change function name, what is being extracted is a field value, not a mask,
                and the join should be implicit.

        """
        # The (most common) case of no mask split is processed separately for
        # performance reasons.
        if encoding_field.no_mask_split:
            # TODO: Use `extract_bits`.
            field_mask = (1 << encoding_field.mask_len) - 1
            return (inst >> encoding_field.mask_lower_pos) & field_mask

        # Case when the field is not unified, the field chars are scattered, the
        # extracted bits have to be unified: walk the 32 bits from the most to the
        # least significant one, appending each bit selected by the mask.
        extracted_value = 0
        for pos in range(31, -1, -1):
            if encoding_field.mask & (1 << pos):
                inst_bit = (inst >> pos) & 1
                extracted_value = (extracted_value << 1) | inst_bit

        return extracted_value

    def disasm_one_inst(self, inst, addr=0):
        """Disassemble one instruction value interpreted as an unsigned int.

        Args:
            inst (int): Actual instruction value.
            addr (Optional[int]): Address of the instruction being disassembled (used for
                packet processing purposes).

        Returns:
            HexagonInstruction: disassembled instruction.

        Raises:
            UnexpectedException: If `inst` is not an int that fits in 32 bits (unsigned).

        TODOs:
            * Define the input type, for now it's an unsigned int with the endianness
                (little endian) resolved.

        """
        if not isinstance(inst, int):
            raise UnexpectedException(
                "The instruction value has to be an int, received: {!r}".format(type(inst))
            )

        if inst < 0 or inst > 0xFFFFFFFF:
            raise UnexpectedException(
                "The instruction value has to be an unsigned 32-bit int, "
                "received: {:#x}".format(inst)
            )

        hi = HexagonInstruction()
        hi.addr = addr
        self.process_packet_info(hi, inst)

        # Parse bits equal to 00 signal a duplex instruction.
        hi.is_duplex = (hi.parse_bits == 0b00)

        if extract_bits(inst, 31, 28) == 0 and not hi.is_duplex:
            # Constant extender instruction, extract the extension value:
            # bits 27:16 | 13:0, joined and moved to the upper 26 bits.
            hi.immext = (extract_bits(inst, 27, 16) << 14) | extract_bits(inst, 13, 0)
            hi.immext <<= 6
            # TODO: Move to a separate function.
        else:
            # Not a constant extender function. Search available templates for a match.
            if self.find_template(inst, hi):
                self.generate_instruction_operands(inst, hi)

        packet_prefix = '{ ' if hi.start_packet else ' '
        hi.text += packet_prefix

        try:
            hi.text += self.generate_inst_text(hi)
            # TODO: Move all str manipulation to `generate_inst_text` function? The nice
            # thing of the current arrangement is the exception catch, where I can have
            # an unknown with {} (i.e., ``{ <unknown> }``) even if the disassembly failed.
        except UnknownInstructionException:
            hi.text += "<unknown>"
            hi.is_unknown = True

        if hi.end_packet:
            hi.text += ' }'
        # Even if the instruction is unknown, the parse bits analysis is
        # still valid, so the start/end packet settings stand, e.g.,
        # ``{ <unknown> }`` is a valid text output.

        self.process_endloops(hi)
        if 0 in hi.endloop:
            hi.text += ':endloop0'

        return hi

    def find_template(self, inst, hi):
        """Find the template for an instruction value.

        Args:
            inst (int): Actual instruction value.
            hi (HexagonInstruction): Instruction object where the `template` attribute is set
                to the value of the found (if any) template.

        Returns:
            bool: True if a template was found; False otherwise (including the case
                where there are no templates at all for the ICLASS of the instruction).

        TODOs:
            * Improve performance.

        """
        if hi.is_duplex:
            template_sources = self.duplex_templates  # type: List[InstructionTemplate]
        else:
            # An ICLASS without any template yields no match (instead of a KeyError),
            # so the instruction is reported as unknown by the caller.
            template_sources = self.segmented_inst_templates.get(extract_bits(inst, 31, 28), [])

        for i, template in enumerate(template_sources):
            encoding = template.encoding
            if (inst & encoding.mask) == encoding.value:
                # Found a template match.
                hi.template = template

                # TODO: Small hack to partially reorder the list by most found, one swap
                # at time, should be improved.
                # Modifying the list here is safe: the iteration is abandoned right after.
                if i != 0:
                    template_sources[i], template_sources[i - 1] = \
                        template_sources[i - 1], template_sources[i]

                return True

        return False