1use crate::error::Location;
4use cranelift_codegen::ir::types;
5use cranelift_codegen::ir::{Block, Value};
6use std::str::CharIndices;
7use std::u16;
8
9#[derive(Debug, PartialEq, Eq, Clone, Copy)]
14pub enum Token<'a> {
15 Comment(&'a str),
16 LPar, RPar, LBrace, RBrace, LBracket, RBracket, LAngle, RAngle, Minus, Plus, Multiply, Comma, Dot, Colon, Equal, Bang, At, Arrow, Float(&'a str), Integer(&'a str), Type(types::Type), DynamicType(u32), Value(Value), Block(Block), Cold, StackSlot(u32), DynamicStackSlot(u32), GlobalValue(u32), MemoryType(u32), Constant(u32), FuncRef(u32), SigRef(u32), UserRef(u32), UserNameRef(u32), ExceptionTableRef(u32), ExceptionTag(u32), TryCallRet(u32), TryCallExn(u32), Name(&'a str), String(&'a str), HexSequence(&'a str), Identifier(&'a str), SourceLoc(&'a str), }
60
61#[derive(Debug, PartialEq, Eq)]
63pub struct LocatedToken<'a> {
64 pub token: Token<'a>,
65 pub location: Location,
66}
67
68fn token(token: Token, loc: Location) -> Result<LocatedToken, LocatedError> {
70 Ok(LocatedToken {
71 token,
72 location: loc,
73 })
74}
75
/// The kinds of error the lexer can report.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexError {
    /// A character that cannot start any token was found; also reported for a string literal
    /// that is still open when the input ends.
    InvalidChar,
}
81
82#[derive(Debug, Clone, Copy, PartialEq, Eq)]
84pub struct LocatedError {
85 pub error: LexError,
86 pub location: Location,
87}
88
89fn error<'a>(error: LexError, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
91 Err(LocatedError {
92 error,
93 location: loc,
94 })
95}
96
/// Count the ASCII decimal digits (`0`-`9`) at the end of `s`.
fn trailing_digits(s: &str) -> usize {
    // Walking bytes is sufficient: no byte of a multi-byte UTF-8 sequence is an ASCII digit.
    s.bytes().rev().take_while(u8::is_ascii_digit).count()
}
106
107pub fn split_entity_name(name: &str) -> Option<(&str, u32)> {
110 let (head, tail) = name.split_at(name.len() - trailing_digits(name));
111 if tail.len() > 1 && tail.starts_with('0') {
112 None
113 } else {
114 tail.parse().ok().map(|n| (head, n))
115 }
116}
117
/// Lexer state: the input text plus a one-character lookahead and its position.
pub struct Lexer<'a> {
    /// The complete text being lexed; token text is sliced out of it.
    source: &'a str,

    /// Iterator over the characters of `source` that come after `lookahead`.
    chars: CharIndices<'a>,

    /// The next character to be consumed, or `None` once the input is exhausted.
    lookahead: Option<char>,

    /// Byte offset of `lookahead` in `source`; `source.len()` once the input is exhausted.
    pos: usize,

    /// Line number of `lookahead`, counted from 1.
    line_number: usize,
}
140
141impl<'a> Lexer<'a> {
142 pub fn new(s: &'a str) -> Self {
143 let mut lex = Self {
144 source: s,
145 chars: s.char_indices(),
146 lookahead: None,
147 pos: 0,
148 line_number: 1,
149 };
150 lex.next_ch();
152 lex
153 }
154
155 fn next_ch(&mut self) -> Option<char> {
159 if self.lookahead == Some('\n') {
160 self.line_number += 1;
161 }
162 match self.chars.next() {
163 Some((idx, ch)) => {
164 self.pos = idx;
165 self.lookahead = Some(ch);
166 }
167 None => {
168 self.pos = self.source.len();
169 self.lookahead = None;
170 }
171 }
172 self.lookahead
173 }
174
175 fn loc(&self) -> Location {
177 Location {
178 line_number: self.line_number,
179 }
180 }
181
182 fn looking_at(&self, prefix: &str) -> bool {
184 self.source[self.pos..].starts_with(prefix)
185 }
186
187 fn looking_at_numeric(&self) -> bool {
189 if let Some(c) = self.lookahead {
190 match c {
191 '0'..='9' => return true,
192 '-' => return true,
193 '+' => return true,
194 '.' => return true,
195 _ => {}
196 }
197 if self.looking_at("NaN") || self.looking_at("Inf") || self.looking_at("sNaN") {
198 return true;
199 }
200 }
201 false
202 }
203
204 fn scan_char(&mut self, tok: Token<'a>) -> Result<LocatedToken<'a>, LocatedError> {
206 assert_ne!(self.lookahead, None);
207 let loc = self.loc();
208 self.next_ch();
209 token(tok, loc)
210 }
211
212 fn scan_chars(
214 &mut self,
215 count: usize,
216 tok: Token<'a>,
217 ) -> Result<LocatedToken<'a>, LocatedError> {
218 let loc = self.loc();
219 for _ in 0..count {
220 assert_ne!(self.lookahead, None);
221 self.next_ch();
222 }
223 token(tok, loc)
224 }
225
226 pub fn rest_of_line(&mut self) -> &'a str {
229 let begin = self.pos;
230 loop {
231 match self.next_ch() {
232 None | Some('\n') => return &self.source[begin..self.pos],
233 _ => {}
234 }
235 }
236 }
237
238 fn scan_comment(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
240 let loc = self.loc();
241 let text = self.rest_of_line();
242 token(Token::Comment(text), loc)
243 }
244
245 fn scan_number(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
261 let begin = self.pos;
262 let loc = self.loc();
263 let mut is_float = false;
264
265 match self.lookahead {
267 Some('-') => {
268 self.next_ch();
269 if !self.looking_at_numeric() {
270 return token(Token::Minus, loc);
272 }
273 }
274 Some('+') => {
275 self.next_ch();
276 if !self.looking_at_numeric() {
277 return token(Token::Plus, loc);
279 }
280 }
281 _ => {}
282 }
283
284 if self.looking_at("NaN:") || self.looking_at("sNaN:") {
286 while self.next_ch() != Some(':') {}
289 is_float = true;
290 } else if self.looking_at("NaN") || self.looking_at("Inf") {
291 is_float = true;
293 }
294
295 loop {
297 match self.next_ch() {
298 Some('-') | Some('_') => {}
299 Some('.') => is_float = true,
300 Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
301 _ => break,
302 }
303 }
304 let text = &self.source[begin..self.pos];
305 if is_float {
306 token(Token::Float(text), loc)
307 } else {
308 token(Token::Integer(text), loc)
309 }
310 }
311
312 fn scan_word(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
315 let begin = self.pos;
316 let loc = self.loc();
317
318 assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_ascii_alphabetic());
319 loop {
320 match self.next_ch() {
321 Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
322 _ => break,
323 }
324 }
325 let text = &self.source[begin..self.pos];
326
327 token(
329 split_entity_name(text)
330 .and_then(|(prefix, number)| {
331 Self::numbered_entity(prefix, number)
332 .or_else(|| Self::value_type(text, prefix, number))
333 })
334 .unwrap_or_else(|| match text {
335 "cold" => Token::Cold,
336 _ => Token::Identifier(text),
337 }),
338 loc,
339 )
340 }
341
342 fn numbered_entity(prefix: &str, number: u32) -> Option<Token<'a>> {
345 match prefix {
346 "v" => Value::with_number(number).map(Token::Value),
347 "block" => Block::with_number(number).map(Token::Block),
348 "ss" => Some(Token::StackSlot(number)),
349 "dss" => Some(Token::DynamicStackSlot(number)),
350 "dt" => Some(Token::DynamicType(number)),
351 "gv" => Some(Token::GlobalValue(number)),
352 "mt" => Some(Token::MemoryType(number)),
353 "const" => Some(Token::Constant(number)),
354 "fn" => Some(Token::FuncRef(number)),
355 "sig" => Some(Token::SigRef(number)),
356 "u" => Some(Token::UserRef(number)),
357 "userextname" => Some(Token::UserNameRef(number)),
358 "extable" => Some(Token::ExceptionTableRef(number)),
359 "tag" => Some(Token::ExceptionTag(number)),
360 "ret" => Some(Token::TryCallRet(number)),
361 "exn" => Some(Token::TryCallExn(number)),
362 _ => None,
363 }
364 }
365
366 fn value_type(text: &str, prefix: &str, number: u32) -> Option<Token<'a>> {
368 let is_vector = prefix.ends_with('x');
369 let scalar = if is_vector {
370 &prefix[0..prefix.len() - 1]
371 } else {
372 text
373 };
374 let base_type = match scalar {
375 "i8" => types::I8,
376 "i16" => types::I16,
377 "i32" => types::I32,
378 "i64" => types::I64,
379 "i128" => types::I128,
380 "f16" => types::F16,
381 "f32" => types::F32,
382 "f64" => types::F64,
383 "f128" => types::F128,
384 _ => return None,
385 };
386 if is_vector {
387 if number <= u32::from(u16::MAX) {
388 base_type.by(number).map(Token::Type)
389 } else {
390 None
391 }
392 } else {
393 Some(Token::Type(base_type))
394 }
395 }
396
397 fn scan_name(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
398 let loc = self.loc();
399 let begin = self.pos + 1;
400
401 assert_eq!(self.lookahead, Some('%'));
402
403 loop {
404 match self.next_ch() {
405 Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
406 _ => break,
407 }
408 }
409
410 let end = self.pos;
411 token(Token::Name(&self.source[begin..end]), loc)
412 }
413
414 fn scan_string(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
416 let loc = self.loc();
417 let begin = self.pos + 1;
418
419 assert_eq!(self.lookahead, Some('"'));
420
421 while let Some(c) = self.next_ch() {
422 if c == '"' {
423 break;
424 }
425 }
426
427 let end = self.pos;
428 if self.lookahead != Some('"') {
429 return error(LexError::InvalidChar, self.loc());
430 }
431 self.next_ch();
432 token(Token::String(&self.source[begin..end]), loc)
433 }
434
435 fn scan_hex_sequence(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
436 let loc = self.loc();
437 let begin = self.pos + 1;
438
439 assert_eq!(self.lookahead, Some('#'));
440
441 while let Some(c) = self.next_ch() {
442 if !char::is_digit(c, 16) {
443 break;
444 }
445 }
446
447 let end = self.pos;
448 token(Token::HexSequence(&self.source[begin..end]), loc)
449 }
450
451 fn looking_at_srcloc(&self) -> bool {
454 match self.lookahead {
455 Some(c) => char::is_digit(c, 16),
456 _ => false,
457 }
458 }
459
460 fn scan_srcloc(&mut self, pos: usize, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
461 let begin = pos + 1;
462 while let Some(c) = self.next_ch() {
463 if !char::is_digit(c, 16) {
464 break;
465 }
466 }
467
468 let end = self.pos;
469 token(Token::SourceLoc(&self.source[begin..end]), loc)
470 }
471
472 pub fn next(&mut self) -> Option<Result<LocatedToken<'a>, LocatedError>> {
476 loop {
477 let loc = self.loc();
478 return match self.lookahead {
479 None => None,
480 Some(';') => Some(self.scan_comment()),
481 Some('(') => Some(self.scan_char(Token::LPar)),
482 Some(')') => Some(self.scan_char(Token::RPar)),
483 Some('{') => Some(self.scan_char(Token::LBrace)),
484 Some('}') => Some(self.scan_char(Token::RBrace)),
485 Some('[') => Some(self.scan_char(Token::LBracket)),
486 Some(']') => Some(self.scan_char(Token::RBracket)),
487 Some('<') => Some(self.scan_char(Token::LAngle)),
488 Some('>') => Some(self.scan_char(Token::RAngle)),
489 Some(',') => Some(self.scan_char(Token::Comma)),
490 Some('.') => Some(self.scan_char(Token::Dot)),
491 Some(':') => Some(self.scan_char(Token::Colon)),
492 Some('=') => Some(self.scan_char(Token::Equal)),
493 Some('!') => Some(self.scan_char(Token::Bang)),
494 Some('+') => Some(self.scan_number()),
495 Some('*') => Some(self.scan_char(Token::Multiply)),
496 Some('-') => {
497 if self.looking_at("->") {
498 Some(self.scan_chars(2, Token::Arrow))
499 } else {
500 Some(self.scan_number())
501 }
502 }
503 Some('0'..='9') => Some(self.scan_number()),
504 Some('a'..='z') | Some('A'..='Z') => {
505 if self.looking_at("NaN") || self.looking_at("Inf") {
506 Some(self.scan_number())
507 } else {
508 Some(self.scan_word())
509 }
510 }
511 Some('%') => Some(self.scan_name()),
512 Some('"') => Some(self.scan_string()),
513 Some('#') => Some(self.scan_hex_sequence()),
514 Some('@') => {
515 let pos = self.pos;
516 let loc = self.loc();
517 self.next_ch();
518 if self.looking_at_srcloc() {
519 Some(self.scan_srcloc(pos, loc))
520 } else {
521 Some(token(Token::At, loc))
522 }
523 }
524 Some(' ') | Some('\x09'..='\x0d') => {
526 self.next_ch();
527 continue;
528 }
529 _ => {
530 self.next_ch();
532 Some(error(LexError::InvalidChar, loc))
533 }
534 };
535 }
536 }
537}
538
539#[cfg(test)]
540mod tests {
541 use super::*;
542
/// `trailing_digits` counts only the decimal digits at the very end of the string.
#[test]
fn digits() {
    let cases = [
        ("", 0),
        ("x", 0),
        ("0x", 0),
        ("x1", 1),
        ("1x1", 1),
        ("1x01", 2),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(trailing_digits(input), expected);
    }
}
552
/// `split_entity_name` needs a trailing number without redundant leading zeros.
#[test]
fn entity_name() {
    let cases = [
        ("", None),
        ("x", None),
        ("x+", None),
        ("x+1", Some(("x+", 1))),
        ("x-1", Some(("x-", 1))),
        ("1", Some(("", 1))),
        ("x1", Some(("x", 1))),
        ("xy0", Some(("xy", 0))),
        ("inst01", None),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(split_entity_name(input), expected);
    }
}
566
567 fn token<'a>(token: Token<'a>, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
568 Some(super::token(token, Location { line_number: line }))
569 }
570
571 fn error<'a>(error: LexError, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
572 Some(super::error(error, Location { line_number: line }))
573 }
574
/// Empty and whitespace-only inputs produce no tokens.
#[test]
fn make_lexer() {
    for &source in ["", " ", "\n "].iter() {
        let mut lexer = Lexer::new(source);
        assert_eq!(lexer.next(), None);
    }
}
585
/// Comments run to the end of the line and are located at the line they start on.
#[test]
fn lex_comment() {
    let cases = vec![
        ("; hello", vec![token(Token::Comment("; hello"), 1), None]),
        (
            "\n ;hello\n;foo",
            vec![
                token(Token::Comment(";hello"), 2),
                token(Token::Comment(";foo"), 3),
                None,
            ],
        ),
        // An invalid character is reported, then lexing carries on after it.
        (
            "$; hello",
            vec![
                error(LexError::InvalidChar, 1),
                token(Token::Comment("; hello"), 1),
                None,
            ],
        ),
    ];

    for (source, expected) in cases {
        let mut lex = Lexer::new(source);
        for want in expected {
            assert_eq!(lex.next(), want);
        }
    }
}
603
/// Single-character tokens, interleaved with a comment and whitespace.
#[test]
fn lex_chars() {
    let mut lex = Lexer::new("(); hello\n = :{, }.");
    let expected = vec![
        token(Token::LPar, 1),
        token(Token::RPar, 1),
        token(Token::Comment("; hello"), 1),
        token(Token::Equal, 2),
        token(Token::Colon, 2),
        token(Token::LBrace, 2),
        token(Token::Comma, 2),
        token(Token::RBrace, 2),
        token(Token::Dot, 2),
        None,
    ];
    for want in expected {
        assert_eq!(lex.next(), want);
    }
}
618
/// Integers and floats, with and without signs.
#[test]
fn lex_numbers() {
    let mut lex = Lexer::new(" 0 2_000 -1,0xf -0x0 0.0 0x0.4p-34 NaN +5");
    let expected = vec![
        token(Token::Integer("0"), 1),
        token(Token::Integer("2_000"), 1),
        token(Token::Integer("-1"), 1),
        token(Token::Comma, 1),
        token(Token::Integer("0xf"), 1),
        token(Token::Integer("-0x0"), 1),
        token(Token::Float("0.0"), 1),
        token(Token::Float("0x0.4p-34"), 1),
        token(Token::Float("NaN"), 1),
        token(Token::Integer("+5"), 1),
        None,
    ];
    for want in expected {
        assert_eq!(lex.next(), want);
    }
}
634
/// Words are classified as entities, types or plain identifiers.
#[test]
fn lex_identifiers() {
    let mut lex = Lexer::new(
        "v0 v00 vx01 block1234567890 block5234567890 v1x vx1 vxvx4 \
         function0 function i8 i32x4 f32x5 f16 f128",
    );
    let expected = vec![
        token(Token::Value(Value::with_number(0).unwrap()), 1),
        // Redundant leading zeros disqualify an entity name.
        token(Token::Identifier("v00"), 1),
        token(Token::Identifier("vx01"), 1),
        token(Token::Block(Block::with_number(1234567890).unwrap()), 1),
        // This number does not fit in a `u32`.
        token(Token::Identifier("block5234567890"), 1),
        token(Token::Identifier("v1x"), 1),
        token(Token::Identifier("vx1"), 1),
        token(Token::Identifier("vxvx4"), 1),
        token(Token::Identifier("function0"), 1),
        token(Token::Identifier("function"), 1),
        token(Token::Type(types::I8), 1),
        token(Token::Type(types::I32X4), 1),
        token(Token::Identifier("f32x5"), 1),
        token(Token::Type(types::F16), 1),
        token(Token::Type(types::F128), 1),
        None,
    ];
    for want in expected {
        assert_eq!(lex.next(), want);
    }
}
664
/// `#` followed by hex digits; the `#` is not part of the token text.
#[test]
fn lex_hex_sequences() {
    let mut lex = Lexer::new("#0 #DEADbeef123 #789");

    for &digits in ["0", "DEADbeef123", "789"].iter() {
        assert_eq!(lex.next(), token(Token::HexSequence(digits), 1));
    }
}
673
/// `%` names are never interpreted as entities; the `%` is not part of the token text.
#[test]
fn lex_names() {
    let mut lex = Lexer::new("%0 %x3 %function %123_abc %ss0 %v3 %block11 %const42 %_");

    let names = [
        "0", "x3", "function", "123_abc", "ss0", "v3", "block11", "const42", "_",
    ];
    for &name in names.iter() {
        assert_eq!(lex.next(), token(Token::Name(name), 1));
    }
}
688
/// Quoted strings: adjacent strings, a lone backslash, and a string spanning several lines.
#[test]
fn lex_strings() {
    // A raw string keeps the embedded quotes and the backslash free of Rust-level escaping.
    let mut lex = Lexer::new(
        r#""" "0" "x3""function" "123 abc" "\" "start
                    and end on
                    different lines" "#,
    );

    assert_eq!(lex.next(), token(Token::String(""), 1));
    assert_eq!(lex.next(), token(Token::String("0"), 1));
    // No whitespace is needed between two strings.
    assert_eq!(lex.next(), token(Token::String("x3"), 1));
    assert_eq!(lex.next(), token(Token::String("function"), 1));
    assert_eq!(lex.next(), token(Token::String("123 abc"), 1));
    // A backslash does not escape the closing quote.
    assert_eq!(lex.next(), token(Token::String(r#"\"#), 1));
    // A string may contain newlines; it is located at the line on which it starts.
    assert_eq!(
        lex.next(),
        token(
            Token::String(
                r#"start
                    and end on
                    different lines"#
            ),
            1
        )
    );
}
715
/// `uN` user references; a following `:` and number are separate tokens.
#[test]
fn lex_userrefs() {
    let mut lex = Lexer::new("u0 u1 u234567890 u9:8765");

    let expected = vec![
        token(Token::UserRef(0), 1),
        token(Token::UserRef(1), 1),
        token(Token::UserRef(234567890), 1),
        token(Token::UserRef(9), 1),
        token(Token::Colon, 1),
        token(Token::Integer("8765"), 1),
        None,
    ];
    for want in expected {
        assert_eq!(lex.next(), want);
    }
}
728}