CARVIEW |
Select Language
HTTP/2 302
server: nginx
date: Thu, 31 Jul 2025 00:41:07 GMT
content-type: text/plain; charset=utf-8
content-length: 0
x-archive-redirect-reason: found capture at 20210117095621
location: https://web.archive.org/web/20210117095621/https://perl5.git.perl.org/perl5.git/blob/HEAD:/toke.c
server-timing: captures_list;dur=0.465760, exclusion.robots;dur=0.019180, exclusion.robots.policy;dur=0.009520, esindex;dur=0.009829, cdx.remote;dur=16.816717, LoadShardBlock;dur=168.020394, PetaboxLoader3.datanode;dur=128.971161
x-app-server: wwwb-app213
x-ts: 302
x-tr: 210
server-timing: TR;dur=0,Tw;dur=0,Tc;dur=0
set-cookie: SERVER=wwwb-app213; path=/
x-location: All
x-rl: 0
x-na: 0
x-page-cache: MISS
server-timing: MISS
x-nid: DigitalOcean
referrer-policy: no-referrer-when-downgrade
permissions-policy: interest-cohort=()
HTTP/2 200
server: nginx
date: Thu, 31 Jul 2025 00:41:09 GMT
content-type: application/xhtml+xml; charset=utf-8
x-archive-orig-date: Sun, 17 Jan 2021 09:56:20 GMT
x-archive-orig-server: Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips
x-archive-orig-keep-alive: timeout=5, max=100
x-archive-orig-connection: Keep-Alive
x-archive-orig-x-crawler-transfer-encoding: chunked
x-archive-orig-content-length: 1048576
x-archive-guessed-content-type: text/html
x-archive-guessed-charset: utf-8
memento-datetime: Sun, 17 Jan 2021 09:56:21 GMT
link: ; rel="original", ; rel="timemap"; type="application/link-format", ; rel="timegate", ; rel="first memento"; datetime="Sun, 17 Jan 2021 09:56:21 GMT", ; rel="memento"; datetime="Sun, 17 Jan 2021 09:56:21 GMT", ; rel="last memento"; datetime="Sun, 17 Jan 2021 09:56:21 GMT"
content-security-policy: default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org web-static.archive.org wayback-api.archive.org athena.archive.org analytics.archive.org pragma.archivelab.org wwwb-events.archive.org
x-archive-src: CC-MAIN-2021-04-1610703511903.11-0026/CC-MAIN-20210117081748-20210117111748-00534.warc.gz
server-timing: captures_list;dur=0.685252, exclusion.robots;dur=0.024889, exclusion.robots.policy;dur=0.011012, esindex;dur=0.012660, cdx.remote;dur=10.983694, LoadShardBlock;dur=76.381324, PetaboxLoader3.datanode;dur=123.846417, load_resource;dur=270.577447, PetaboxLoader3.resolve;dur=142.166125
x-app-server: wwwb-app213
x-ts: 200
x-tr: 1757
server-timing: TR;dur=0,Tw;dur=0,Tc;dur=0
x-location: All
x-rl: 0
x-na: 0
x-page-cache: MISS
server-timing: MISS
x-nid: DigitalOcean
referrer-policy: no-referrer-when-downgrade
permissions-policy: interest-cohort=()
perl5.git.perl.org Git - perl5.git/blob - toke.c
This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
1 /* toke.c
2 *
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
11 /*
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
15 */
17 /*
18 * This file is the lexer for Perl. It's closely linked to the
19 * parser, perly.y.
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
24 /*
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmnU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
35 =cut
36 */
38 #include "EXTERN.h"
39 #define PERL_IN_TOKE_C
40 #include "perl.h"
41 #include "invlist_inline.h"
43 #define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
97 static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
99 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
101 #define XENUMMASK 0x3f
102 #define XFAKEEOF 0x40
103 #define XFAKEBRACK 0x80
105 #ifdef USE_UTF8_SCRIPTS
106 # define UTF cBOOL(!IN_BYTES)
107 #else
108 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
109 #endif
111 /* The maximum number of characters preceding the unrecognized one to display */
112 #define UNRECOGNIZED_PRECEDE_COUNT 10
114 /* In variables named $^X, these are the legal values for X.
115 * 1999-02-27 mjd-perl-patch@plover.com */
116 #define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
118 #define SPACE_OR_TAB(c) isBLANK_A(c)
120 #define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
125 /* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
127 * can get by with a single comparison (if the compiler is smart enough).
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
131 */
133 /* #define LEX_NOTPARSING 11 is done in perl.h. */
135 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
136 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
141 /* at end of code, eg "$x" followed by: */
142 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
145 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147 #define LEX_INTERPCONST 2 /* NOT USED */
148 #define LEX_FORMLINE 1 /* expecting a format line */
150 /* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
152 can also return it.
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
156 a safe value here.
157 */
158 #define YYL_RETRY (-1)
160 #ifdef DEBUGGING
161 static const char* const lex_state_names[] = {
162 "KNOWNEXT",
163 "FORMLINE",
164 "INTERPCONST",
165 "INTERPCONCAT",
166 "INTERPENDMAYBE",
167 "INTERPEND",
168 "INTERPSTART",
169 "INTERPPUSH",
170 "INTERPCASEMOD",
171 "INTERPNORMAL",
172 "NORMAL"
173 };
174 #endif
176 #include "keywords.h"
178 /* CLINE is a macro that ensures PL_copline has a sane value */
180 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
182 /*
183 * Convenience functions to return different tokens and prime the
184 * lexer for the next token. They all take an argument.
185 *
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
197 * FUN0OP : zero-argument function, with its op created in this file
198 * FUN1 : not used, except for not, which isn't a UNIOP
199 * BOop : bitwise or or xor
200 * BAop : bitwise and
201 * BCop : bitwise complement
202 * SHop : shift operator
203 * PWop : power operator
204 * PMop : pattern-matching operator
205 * Aop : addition-level operator
206 * AopNOASSIGN : addition-level operator that is never part of .=
207 * Mop : multiplication-level operator
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
212 *
213 * Also see LOP and lop() below.
214 */
216 #ifdef DEBUGGING /* Serve -DT. */
217 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
218 #else
219 # define REPORT(retval) (retval)
220 #endif
222 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
224 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
225 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
229 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
230 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
231 pl_yylval.ival=f, \
232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
233 REPORT((int)LOOPEX))
234 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
236 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
237 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
238 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
240 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
241 REPORT(PERLY_TILDE)
242 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
244 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
245 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
246 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
247 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
248 #define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249 #define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250 #define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251 #define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
253 /* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
257 */
258 #define UNI3(f,x,have_x) { \
259 pl_yylval.ival = f; \
260 if (have_x) PL_expect = x; \
261 PL_bufptr = s; \
262 PL_last_uni = PL_oldbufptr; \
263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
264 if (*s == '(') \
265 return REPORT( (int)FUNC1 ); \
266 s = skipspace(s); \
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
268 }
269 #define UNI(f) UNI3(f,XTERM,1)
270 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
271 #define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
273 OPERATOR(f); \
274 }
276 #define UNIBRACK(f) UNI3(f,0,0)
278 /* grandfather return to old style */
279 #define OLDLOP(f) \
280 do { \
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
284 PL_expect = XTERM; \
285 PL_bufptr = s; \
286 return (int)LSTOP; \
287 } while(0)
289 #define COPLINE_INC_WITH_HERELINES \
290 STMT_START { \
291 CopLINE_inc(PL_curcop); \
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
295 } STMT_END
296 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298 #define COPLINE_SET_FROM_MULTI_END \
299 STMT_START { \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
302 PL_parser->herelines = 0; \
303 } STMT_END
306 /* A file-local structure for passing around information about subroutines and
307 * related definable words */
308 struct code {
309 SV *sv;
310 CV *cv;
311 GV *gv, **gvp;
312 OP *rv2cv_op;
313 PADOFFSET off;
314 bool lex;
315 };
317 static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
319 #ifdef DEBUGGING
321 /* how to interpret the pl_yylval associated with the token */
322 enum token_type {
323 TOKENTYPE_NONE,
324 TOKENTYPE_IVAL,
325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
326 TOKENTYPE_PVAL,
327 TOKENTYPE_OPVAL
328 };
330 #define DEBUG_TOKEN(Type, Name) \
331 { Name, TOKENTYPE_##Type, #Name }
333 static struct debug_tokens {
334 const int token;
335 enum token_type type;
336 const char *name;
337 } const debug_tokens[] =
338 {
339 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
340 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
341 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
342 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
343 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
344 { ARROW, TOKENTYPE_NONE, "ARROW" },
345 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
346 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
347 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
348 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
349 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
350 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
351 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
352 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
353 { DO, TOKENTYPE_NONE, "DO" },
354 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
355 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
356 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
357 { ELSE, TOKENTYPE_NONE, "ELSE" },
358 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
359 { FOR, TOKENTYPE_IVAL, "FOR" },
360 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
361 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
362 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
363 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
364 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
365 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
366 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
367 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
368 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
369 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
370 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
371 { IF, TOKENTYPE_IVAL, "IF" },
372 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
373 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
374 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
375 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
376 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
377 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
378 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
379 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
380 { MY, TOKENTYPE_IVAL, "MY" },
381 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
382 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
383 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
384 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
385 { OROP, TOKENTYPE_IVAL, "OROP" },
386 { OROR, TOKENTYPE_NONE, "OROR" },
387 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
388 DEBUG_TOKEN (IVAL, PERLY_AMPERSAND),
389 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
390 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
391 DEBUG_TOKEN (IVAL, PERLY_BRACKET_CLOSE),
392 DEBUG_TOKEN (IVAL, PERLY_BRACKET_OPEN),
393 DEBUG_TOKEN (IVAL, PERLY_COLON),
394 DEBUG_TOKEN (IVAL, PERLY_COMMA),
395 DEBUG_TOKEN (IVAL, PERLY_DOT),
396 DEBUG_TOKEN (IVAL, PERLY_EQUAL_SIGN),
397 DEBUG_TOKEN (IVAL, PERLY_EXCLAMATION_MARK),
398 DEBUG_TOKEN (IVAL, PERLY_MINUS),
399 DEBUG_TOKEN (IVAL, PERLY_PAREN_OPEN),
400 DEBUG_TOKEN (IVAL, PERLY_PERCENT_SIGN),
401 DEBUG_TOKEN (IVAL, PERLY_PLUS),
402 DEBUG_TOKEN (IVAL, PERLY_QUESTION_MARK),
403 DEBUG_TOKEN (IVAL, PERLY_SEMICOLON),
404 DEBUG_TOKEN (IVAL, PERLY_SLASH),
405 DEBUG_TOKEN (IVAL, PERLY_SNAIL),
406 DEBUG_TOKEN (IVAL, PERLY_STAR),
407 DEBUG_TOKEN (IVAL, PERLY_TILDE),
408 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
409 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
410 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
411 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
412 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
413 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
414 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
415 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
416 { PREINC, TOKENTYPE_NONE, "PREINC" },
417 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
418 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
419 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
420 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
421 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
422 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
423 { SUB, TOKENTYPE_NONE, "SUB" },
424 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
425 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
426 { THING, TOKENTYPE_OPVAL, "THING" },
427 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
428 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
429 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
430 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
431 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
432 { USE, TOKENTYPE_IVAL, "USE" },
433 { WHEN, TOKENTYPE_IVAL, "WHEN" },
434 { WHILE, TOKENTYPE_IVAL, "WHILE" },
435 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
436 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
437 { 0, TOKENTYPE_NONE, NULL }
438 };
440 #undef DEBUG_TOKEN
442 /* dump the returned token in rv, plus any optional arg in pl_yylval */
444 STATIC int
445 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
446 {
447 PERL_ARGS_ASSERT_TOKEREPORT;
449 if (DEBUG_T_TEST) {
450 const char *name = NULL;
451 enum token_type type = TOKENTYPE_NONE;
452 const struct debug_tokens *p;
453 SV* const report = newSVpvs("<== ");
455 for (p = debug_tokens; p->token; p++) {
456 if (p->token == (int)rv) {
457 name = p->name;
458 type = p->type;
459 break;
460 }
461 }
462 if (name)
463 Perl_sv_catpv(aTHX_ report, name);
464 else if (isGRAPH(rv))
465 {
466 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
467 if ((char)rv == 'p')
468 sv_catpvs(report, " (pending identifier)");
469 }
470 else if (!rv)
471 sv_catpvs(report, "EOF");
472 else
473 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
474 switch (type) {
475 case TOKENTYPE_NONE:
476 break;
477 case TOKENTYPE_IVAL:
478 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
479 break;
480 case TOKENTYPE_OPNUM:
481 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
482 PL_op_name[lvalp->ival]);
483 break;
484 case TOKENTYPE_PVAL:
485 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
486 break;
487 case TOKENTYPE_OPVAL:
488 if (lvalp->opval) {
489 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
490 PL_op_name[lvalp->opval->op_type]);
491 if (lvalp->opval->op_type == OP_CONST) {
492 Perl_sv_catpvf(aTHX_ report, " %s",
493 SvPEEK(cSVOPx_sv(lvalp->opval)));
494 }
496 }
497 else
498 sv_catpvs(report, "(opval=null)");
499 break;
500 }
501 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
502 };
503 return (int)rv;
504 }
507 /* print the buffer with suitable escapes */
509 STATIC void
510 S_printbuf(pTHX_ const char *const fmt, const char *const s)
511 {
512 SV* const tmp = newSVpvs("");
514 PERL_ARGS_ASSERT_PRINTBUF;
516 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
517 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
518 GCC_DIAG_RESTORE_STMT;
519 SvREFCNT_dec(tmp);
520 }
522 #endif
524 /*
525 * S_ao
526 *
527 * This subroutine looks for an '=' next to the operator that has just been
528 * parsed and turns it into an ASSIGNOP if it finds one.
529 */
531 STATIC int
532 S_ao(pTHX_ int toketype)
533 {
534 if (*PL_bufptr == '=') {
535 PL_bufptr++;
536 if (toketype == ANDAND)
537 pl_yylval.ival = OP_ANDASSIGN;
538 else if (toketype == OROR)
539 pl_yylval.ival = OP_ORASSIGN;
540 else if (toketype == DORDOR)
541 pl_yylval.ival = OP_DORASSIGN;
542 toketype = ASSIGNOP;
543 }
544 return REPORT(toketype);
545 }
547 /*
548 * S_no_op
549 * When Perl expects an operator and finds something else, no_op
550 * prints the warning. It always prints "<something> found where
551 * operator expected. It prints "Missing semicolon on previous line?"
552 * if the surprise occurs at the start of the line. "do you need to
553 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
554 * where the compiler doesn't know if foo is a method call or a function.
555 * It prints "Missing operator before end of line" if there's nothing
556 * after the missing operator, or "... before <...>" if there is something
557 * after the missing operator.
558 *
559 * PL_bufptr is expected to point to the start of the thing that was found,
560 * and s after the next token or partial token.
561 */
563 STATIC void
564 S_no_op(pTHX_ const char *const what, char *s)
565 {
566 char * const oldbp = PL_bufptr;
567 const bool is_first = (PL_oldbufptr == PL_linestart);
569 PERL_ARGS_ASSERT_NO_OP;
571 if (!s)
572 s = oldbp;
573 else
574 PL_bufptr = s;
575 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
576 if (ckWARN_d(WARN_SYNTAX)) {
577 if (is_first)
578 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
579 "\t(Missing semicolon on previous line?)\n");
580 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
581 PL_bufend,
582 UTF))
583 {
584 const char *t;
585 for (t = PL_oldoldbufptr;
586 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
587 t += UTF ? UTF8SKIP(t) : 1)
588 {
589 NOOP;
590 }
591 if (t < PL_bufptr && isSPACE(*t))
592 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
593 "\t(Do you need to predeclare %" UTF8f "?)\n",
594 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
595 }
596 else {
597 assert(s >= oldbp);
598 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
599 "\t(Missing operator before %" UTF8f "?)\n",
600 UTF8fARG(UTF, s - oldbp, oldbp));
601 }
602 }
603 PL_bufptr = oldbp;
604 }
606 /*
607 * S_missingterm
608 * Complain about missing quote/regexp/heredoc terminator.
609 * If it's called with NULL then it cauterizes the line buffer.
610 * If we're in a delimited string and the delimiter is a control
611 * character, it's reformatted into a two-char sequence like ^C.
612 * This is fatal.
613 */
615 STATIC void
616 S_missingterm(pTHX_ char *s, STRLEN len)
617 {
618 char tmpbuf[UTF8_MAXBYTES + 1];
619 char q;
620 bool uni = FALSE;
621 SV *sv;
622 if (s) {
623 char * const nl = (char *) my_memrchr(s, '\n', len);
624 if (nl) {
625 *nl = '\0';
626 len = nl - s;
627 }
628 uni = UTF;
629 }
630 else if (PL_multi_close < 32) {
631 *tmpbuf = '^';
632 tmpbuf[1] = (char)toCTRL(PL_multi_close);
633 tmpbuf[2] = '\0';
634 s = tmpbuf;
635 len = 2;
636 }
637 else {
638 if (LIKELY(PL_multi_close < 256)) {
639 *tmpbuf = (char)PL_multi_close;
640 tmpbuf[1] = '\0';
641 len = 1;
642 }
643 else {
644 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
645 *end = '\0';
646 len = end - tmpbuf;
647 uni = TRUE;
648 }
649 s = tmpbuf;
650 }
651 q = memchr(s, '"', len) ? '\'' : '"';
652 sv = sv_2mortal(newSVpvn(s, len));
653 if (uni)
654 SvUTF8_on(sv);
655 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
656 " anywhere before EOF", q, SVfARG(sv), q);
657 }
659 #include "feature.h"
661 /*
662 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
663 * utf16-to-utf8-reversed.
664 */
666 #ifdef PERL_CR_FILTER
667 static void
668 strip_return(SV *sv)
669 {
670 const char *s = SvPVX_const(sv);
671 const char * const e = s + SvCUR(sv);
673 PERL_ARGS_ASSERT_STRIP_RETURN;
675 /* outer loop optimized to do nothing if there are no CR-LFs */
676 while (s < e) {
677 if (*s++ == '\r' && *s == '\n') {
678 /* hit a CR-LF, need to copy the rest */
679 char *d = s - 1;
680 *d++ = *s++;
681 while (s < e) {
682 if (*s == '\r' && s[1] == '\n')
683 s++;
684 *d++ = *s++;
685 }
686 SvCUR(sv) -= s - d;
687 return;
688 }
689 }
690 }
692 STATIC I32
693 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
694 {
695 const I32 count = FILTER_READ(idx+1, sv, maxlen);
696 if (count > 0 && !maxlen)
697 strip_return(sv);
698 return count;
699 }
700 #endif
702 /*
703 =for apidoc lex_start
705 Creates and initialises a new lexer/parser state object, supplying
706 a context in which to lex and parse from a new source of Perl code.
707 A pointer to the new state object is placed in L</PL_parser>. An entry
708 is made on the save stack so that upon unwinding, the new state object
709 will be destroyed and the former value of L</PL_parser> will be restored.
710 Nothing else need be done to clean up the parsing context.
712 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
713 non-null, provides a string (in SV form) containing code to be parsed.
714 A copy of the string is made, so subsequent modification of C<line>
715 does not affect parsing. C<rsfp>, if non-null, provides an input stream
716 from which code will be read to be parsed. If both are non-null, the
717 code in C<line> comes first and must consist of complete lines of input,
718 and C<rsfp> supplies the remainder of the source.
720 The C<flags> parameter is reserved for future use. Currently it is only
721 used by perl internally, so extensions should always pass zero.
723 =cut
724 */
726 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
727 can share filters with the current parser.
728 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
729 caller, hence isn't owned by the parser, so shouldn't be closed on parser
730 destruction. This is used to handle the case of defaulting to reading the
731 script from the standard input because no filename was given on the command
732 line (without getting confused by situation where STDIN has been closed, so
733 the script handle is opened on fd 0) */
735 void
736 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
737 {
738 const char *s = NULL;
739 yy_parser *parser, *oparser;
741 if (flags && flags & ~LEX_START_FLAGS)
742 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
744 /* create and initialise a parser */
746 Newxz(parser, 1, yy_parser);
747 parser->old_parser = oparser = PL_parser;
748 PL_parser = parser;
750 parser->stack = NULL;
751 parser->stack_max1 = NULL;
752 parser->ps = NULL;
754 /* on scope exit, free this parser and restore any outer one */
755 SAVEPARSER(parser);
756 parser->saved_curcop = PL_curcop;
758 /* initialise lexer state */
760 parser->nexttoke = 0;
761 parser->error_count = oparser ? oparser->error_count : 0;
762 parser->copline = parser->preambling = NOLINE;
763 parser->lex_state = LEX_NORMAL;
764 parser->expect = XSTATE;
765 parser->rsfp = rsfp;
766 parser->recheck_utf8_validity = TRUE;
767 parser->rsfp_filters =
768 !(flags & LEX_START_SAME_FILTER) || !oparser
769 ? NULL
770 : MUTABLE_AV(SvREFCNT_inc(
771 oparser->rsfp_filters
772 ? oparser->rsfp_filters
773 : (oparser->rsfp_filters = newAV())
774 ));
776 Newx(parser->lex_brackstack, 120, char);
777 Newx(parser->lex_casestack, 12, char);
778 *parser->lex_casestack = '\0';
779 Newxz(parser->lex_shared, 1, LEXSHARED);
781 if (line) {
782 STRLEN len;
783 const U8* first_bad_char_loc;
785 s = SvPV_const(line, len);
787 if ( SvUTF8(line)
788 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
789 SvCUR(line),
790 &first_bad_char_loc)))
791 {
792 _force_out_malformed_utf8_message(first_bad_char_loc,
793 (U8 *) s + SvCUR(line),
794 0,
795 1 /* 1 means die */ );
796 NOT_REACHED; /* NOTREACHED */
797 }
799 parser->linestr = flags & LEX_START_COPIED
800 ? SvREFCNT_inc_simple_NN(line)
801 : newSVpvn_flags(s, len, SvUTF8(line));
802 if (!rsfp)
803 sv_catpvs(parser->linestr, "\n;");
804 } else {
805 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
806 }
808 parser->oldoldbufptr =
809 parser->oldbufptr =
810 parser->bufptr =
811 parser->linestart = SvPVX(parser->linestr);
812 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
813 parser->last_lop = parser->last_uni = NULL;
815 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
816 |LEX_DONT_CLOSE_RSFP));
817 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
818 |LEX_DONT_CLOSE_RSFP));
820 parser->in_pod = parser->filtered = 0;
821 }
824 /* delete a parser object */
826 void
827 Perl_parser_free(pTHX_ const yy_parser *parser)
828 {
829 PERL_ARGS_ASSERT_PARSER_FREE;
831 PL_curcop = parser->saved_curcop;
832 SvREFCNT_dec(parser->linestr);
834 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
835 PerlIO_clearerr(parser->rsfp);
836 else if (parser->rsfp && (!parser->old_parser
837 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
838 PerlIO_close(parser->rsfp);
839 SvREFCNT_dec(parser->rsfp_filters);
840 SvREFCNT_dec(parser->lex_stuff);
841 SvREFCNT_dec(parser->lex_sub_repl);
843 Safefree(parser->lex_brackstack);
844 Safefree(parser->lex_casestack);
845 Safefree(parser->lex_shared);
846 PL_parser = parser->old_parser;
847 Safefree(parser);
848 }
850 void
851 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
852 {
853 I32 nexttoke = parser->nexttoke;
854 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
855 while (nexttoke--) {
856 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
857 && parser->nextval[nexttoke].opval
858 && parser->nextval[nexttoke].opval->op_slabbed
859 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
860 op_free(parser->nextval[nexttoke].opval);
861 parser->nextval[nexttoke].opval = NULL;
862 }
863 }
864 }
867 /*
868 =for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
870 Buffer scalar containing the chunk currently under consideration of the
871 text currently being lexed. This is always a plain string scalar (for
872 which C<SvPOK> is true). It is not intended to be used as a scalar by
873 normal scalar means; instead refer to the buffer directly by the pointer
874 variables described below.
876 The lexer maintains various C<char*> pointers to things in the
877 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
878 reallocated, all of these pointers must be updated. Don't attempt to
879 do this manually, but rather use L</lex_grow_linestr> if you need to
880 reallocate the buffer.
882 The content of the text chunk in the buffer is commonly exactly one
883 complete line of input, up to and including a newline terminator,
884 but there are situations where it is otherwise. The octets of the
885 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
886 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
887 flag on this scalar, which may disagree with it.
889 For direct examination of the buffer, the variable
890 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
891 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
892 of these pointers is usually preferable to examination of the scalar
893 through normal scalar means.
895 =for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
897 Direct pointer to the end of the chunk of text currently being lexed, the
898 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
899 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
900 always located at the end of the buffer, and does not count as part of
901 the buffer's contents.
903 =for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
905 Points to the current position of lexing inside the lexer buffer.
906 Characters around this point may be freely examined, within
907 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
908 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
909 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
911 Lexing code (whether in the Perl core or not) moves this pointer past
912 the characters that it consumes. It is also expected to perform some
913 bookkeeping whenever a newline character is consumed. This movement
914 can be more conveniently performed by the function L</lex_read_to>,
915 which handles newlines appropriately.
917 Interpretation of the buffer's octets can be abstracted out by
918 using the slightly higher-level functions L</lex_peek_unichar> and
919 L</lex_read_unichar>.
921 =for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
923 Points to the start of the current line inside the lexer buffer.
924 This is useful for indicating at which column an error occurred, and
925 not much else. This must be updated by any lexing code that consumes
926 a newline; the function L</lex_read_to> handles this detail.
928 =cut
929 */
931 /*
932 =for apidoc lex_bufutf8
934 Indicates whether the octets in the lexer buffer
935 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
936 of Unicode characters. If not, they should be interpreted as Latin-1
937 characters. This is analogous to the C<SvUTF8> flag for scalars.
939 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
940 contains valid UTF-8. Lexing code must be robust in the face of invalid
941 encoding.
943 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
944 is significant, but not the whole story regarding the input character
945 encoding. Normally, when a file is being read, the scalar contains octets
946 and its C<SvUTF8> flag is off, but the octets should be interpreted as
947 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
948 however, the scalar may have the C<SvUTF8> flag on, and in this case its
949 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
950 is in effect. This logic may change in the future; use this function
951 instead of implementing the logic yourself.
953 =cut
954 */
956 bool
957 Perl_lex_bufutf8(pTHX)
958 {
959 return UTF;
960 }
962 /*
963 =for apidoc lex_grow_linestr
965 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
966 at least C<len> octets (including terminating C<NUL>). Returns a
967 pointer to the reallocated buffer. This is necessary before making
968 any direct modification of the buffer that would increase its length.
969 L</lex_stuff_pvn> provides a more convenient way to insert text into
970 the buffer.
972 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
973 this function updates all of the lexer's variables that point directly
974 into the buffer.
976 =cut
977 */
979 char *
980 Perl_lex_grow_linestr(pTHX_ STRLEN len)
981 {
982 SV *linestr;
983 char *buf;
984 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
985 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
986 bool current;
988 linestr = PL_parser->linestr;
989 buf = SvPVX(linestr);
990 if (len <= SvLEN(linestr))
991 return buf;
993 /* Is the lex_shared linestr SV the same as the current linestr SV?
994 * Only in this case does re_eval_start need adjusting, since it
995 * points within lex_shared->ls_linestr's buffer */
996 current = ( !PL_parser->lex_shared->ls_linestr
997 || linestr == PL_parser->lex_shared->ls_linestr);
999 bufend_pos = PL_parser->bufend - buf;
1000 bufptr_pos = PL_parser->bufptr - buf;
1001 oldbufptr_pos = PL_parser->oldbufptr - buf;
1002 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1003 linestart_pos = PL_parser->linestart - buf;
1004 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1005 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1006 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
1007 PL_parser->lex_shared->re_eval_start - buf : 0;
1009 buf = sv_grow(linestr, len);
1011 PL_parser->bufend = buf + bufend_pos;
1012 PL_parser->bufptr = buf + bufptr_pos;
1013 PL_parser->oldbufptr = buf + oldbufptr_pos;
1014 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1015 PL_parser->linestart = buf + linestart_pos;
1016 if (PL_parser->last_uni)
1017 PL_parser->last_uni = buf + last_uni_pos;
1018 if (PL_parser->last_lop)
1019 PL_parser->last_lop = buf + last_lop_pos;
1020 if (current && PL_parser->lex_shared->re_eval_start)
1021 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
1022 return buf;
1023 }
1025 /*
1026 =for apidoc lex_stuff_pvn
1028 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1029 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1030 reallocating the buffer if necessary. This means that lexing code that
1031 runs later will see the characters as if they had appeared in the input.
1032 It is not recommended to do this as part of normal parsing, and most
1033 uses of this facility run the risk of the inserted characters being
1034 interpreted in an unintended manner.
1036 The string to be inserted is represented by C<len> octets starting
1037 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1038 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
1039 The characters are recoded for the lexer buffer, according to how the
1040 buffer is currently being interpreted (L</lex_bufutf8>). If a string
1041 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
1042 function is more convenient.
1044 =for apidoc Amnh||LEX_STUFF_UTF8
1046 =cut
1047 */
1049 void
1050 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
1051 {
1052 char *bufptr;
1053 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1054 if (flags & ~(LEX_STUFF_UTF8))
1055 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1056 if (UTF) {
1057 if (flags & LEX_STUFF_UTF8) {
1058 goto plain_copy;
1059 } else {
1060 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1061 (U8 *) pv + len);
1062 const char *p, *e = pv+len;;
1063 if (!highhalf)
1064 goto plain_copy;
1065 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1066 bufptr = PL_parser->bufptr;
1067 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1068 SvCUR_set(PL_parser->linestr,
1069 SvCUR(PL_parser->linestr) + len+highhalf);
1070 PL_parser->bufend += len+highhalf;
1071 for (p = pv; p != e; p++) {
1072 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
1073 }
1074 }
1075 } else {
1076 if (flags & LEX_STUFF_UTF8) {
1077 STRLEN highhalf = 0;
1078 const char *p, *e = pv+len;
1079 for (p = pv; p != e; p++) {
1080 U8 c = (U8)*p;
1081 if (UTF8_IS_ABOVE_LATIN1(c)) {
1082 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1083 "non-Latin-1 character into Latin-1 input");
1084 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1085 p++;
1086 highhalf++;
1087 } else assert(UTF8_IS_INVARIANT(c));
1088 }
1089 if (!highhalf)
1090 goto plain_copy;
1091 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1092 bufptr = PL_parser->bufptr;
1093 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1094 SvCUR_set(PL_parser->linestr,
1095 SvCUR(PL_parser->linestr) + len-highhalf);
1096 PL_parser->bufend += len-highhalf;
1097 p = pv;
1098 while (p < e) {
1099 if (UTF8_IS_INVARIANT(*p)) {
1100 *bufptr++ = *p;
1101 p++;
1102 }
1103 else {
1104 assert(p < e -1 );
1105 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1106 p += 2;
1107 }
1108 }
1109 } else {
1110 plain_copy:
1111 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1112 bufptr = PL_parser->bufptr;
1113 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1114 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1115 PL_parser->bufend += len;
1116 Copy(pv, bufptr, len, char);
1117 }
1118 }
1119 }
1121 /*
1122 =for apidoc lex_stuff_pv
1124 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1125 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1126 reallocating the buffer if necessary. This means that lexing code that
1127 runs later will see the characters as if they had appeared in the input.
1128 It is not recommended to do this as part of normal parsing, and most
1129 uses of this facility run the risk of the inserted characters being
1130 interpreted in an unintended manner.
1132 The string to be inserted is represented by octets starting at C<pv>
1133 and continuing to the first nul. These octets are interpreted as either
1134 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1135 in C<flags>. The characters are recoded for the lexer buffer, according
1136 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1137 If it is not convenient to nul-terminate a string to be inserted, the
1138 L</lex_stuff_pvn> function is more appropriate.
1140 =cut
1141 */
1143 void
1144 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1145 {
1146 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1147 lex_stuff_pvn(pv, strlen(pv), flags);
1148 }
1150 /*
1151 =for apidoc lex_stuff_sv
1153 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1154 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1155 reallocating the buffer if necessary. This means that lexing code that
1156 runs later will see the characters as if they had appeared in the input.
1157 It is not recommended to do this as part of normal parsing, and most
1158 uses of this facility run the risk of the inserted characters being
1159 interpreted in an unintended manner.
1161 The string to be inserted is the string value of C<sv>. The characters
1162 are recoded for the lexer buffer, according to how the buffer is currently
1163 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1164 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1165 need to construct a scalar.
1167 =cut
1168 */
1170 void
1171 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1172 {
1173 char *pv;
1174 STRLEN len;
1175 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1176 if (flags)
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1178 pv = SvPV(sv, len);
1179 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1180 }
1182 /*
1183 =for apidoc lex_unstuff
1185 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1186 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1187 This hides the discarded text from any lexing code that runs later,
1188 as if the text had never appeared.
1190 This is not the normal way to consume lexed text. For that, use
1191 L</lex_read_to>.
1193 =cut
1194 */
1196 void
1197 Perl_lex_unstuff(pTHX_ char *ptr)
1198 {
1199 char *buf, *bufend;
1200 STRLEN unstuff_len;
1201 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1202 buf = PL_parser->bufptr;
1203 if (ptr < buf)
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1205 if (ptr == buf)
1206 return;
1207 bufend = PL_parser->bufend;
1208 if (ptr > bufend)
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1210 unstuff_len = ptr - buf;
1211 Move(ptr, buf, bufend+1-ptr, char);
1212 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1213 PL_parser->bufend = bufend - unstuff_len;
1214 }
1216 /*
1217 =for apidoc lex_read_to
1219 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1220 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1221 performing the correct bookkeeping whenever a newline character is passed.
1222 This is the normal way to consume lexed text.
1224 Interpretation of the buffer's octets can be abstracted out by
1225 using the slightly higher-level functions L</lex_peek_unichar> and
1226 L</lex_read_unichar>.
1228 =cut
1229 */
1231 void
1232 Perl_lex_read_to(pTHX_ char *ptr)
1233 {
1234 char *s;
1235 PERL_ARGS_ASSERT_LEX_READ_TO;
1236 s = PL_parser->bufptr;
1237 if (ptr < s || ptr > PL_parser->bufend)
1238 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1239 for (; s != ptr; s++)
1240 if (*s == '\n') {
1241 COPLINE_INC_WITH_HERELINES;
1242 PL_parser->linestart = s+1;
1243 }
1244 PL_parser->bufptr = ptr;
1245 }
1247 /*
1248 =for apidoc lex_discard_to
1250 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1251 up to C<ptr>. The remaining content of the buffer will be moved, and
1252 all pointers into the buffer updated appropriately. C<ptr> must not
1253 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1254 it is not permitted to discard text that has yet to be lexed.
1256 Normally it is not necessarily to do this directly, because it suffices to
1257 use the implicit discarding behaviour of L</lex_next_chunk> and things
1258 based on it. However, if a token stretches across multiple lines,
1259 and the lexing code has kept multiple lines of text in the buffer for
1260 that purpose, then after completion of the token it would be wise to
1261 explicitly discard the now-unneeded earlier lines, to avoid future
1262 multi-line tokens growing the buffer without bound.
1264 =cut
1265 */
1267 void
1268 Perl_lex_discard_to(pTHX_ char *ptr)
1269 {
1270 char *buf;
1271 STRLEN discard_len;
1272 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1273 buf = SvPVX(PL_parser->linestr);
1274 if (ptr < buf)
1275 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1276 if (ptr == buf)
1277 return;
1278 if (ptr > PL_parser->bufptr)
1279 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1280 discard_len = ptr - buf;
1281 if (PL_parser->oldbufptr < ptr)
1282 PL_parser->oldbufptr = ptr;
1283 if (PL_parser->oldoldbufptr < ptr)
1284 PL_parser->oldoldbufptr = ptr;
1285 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1286 PL_parser->last_uni = NULL;
1287 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1288 PL_parser->last_lop = NULL;
1289 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1290 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1291 PL_parser->bufend -= discard_len;
1292 PL_parser->bufptr -= discard_len;
1293 PL_parser->oldbufptr -= discard_len;
1294 PL_parser->oldoldbufptr -= discard_len;
1295 if (PL_parser->last_uni)
1296 PL_parser->last_uni -= discard_len;
1297 if (PL_parser->last_lop)
1298 PL_parser->last_lop -= discard_len;
1299 }
1301 void
1302 Perl_notify_parser_that_changed_to_utf8(pTHX)
1303 {
1304 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1305 * off to on. At compile time, this has the effect of entering a 'use
1306 * utf8' section. This means that any input was not previously checked for
1307 * UTF-8 (because it was off), but now we do need to check it, or our
1308 * assumptions about the input being sane could be wrong, and we could
1309 * segfault. This routine just sets a flag so that the next time we look
1310 * at the input we do the well-formed UTF-8 check. If we aren't in the
1311 * proper phase, there may not be a parser object, but if there is, setting
1312 * the flag is harmless */
1314 if (PL_parser) {
1315 PL_parser->recheck_utf8_validity = TRUE;
1316 }
1317 }
1319 /*
1320 =for apidoc lex_next_chunk
1322 Reads in the next chunk of text to be lexed, appending it to
1323 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1324 looked to the end of the current chunk and wants to know more. It is
1325 usual, but not necessary, for lexing to have consumed the entirety of
1326 the current chunk at this time.
1328 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1329 chunk (i.e., the current chunk has been entirely consumed), normally the
1330 current chunk will be discarded at the same time that the new chunk is
1331 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1332 will not be discarded. If the current chunk has not been entirely
1333 consumed, then it will not be discarded regardless of the flag.
1335 Returns true if some new text was added to the buffer, or false if the
1336 buffer has reached the end of the input text.
1338 =for apidoc Amnh||LEX_KEEP_PREVIOUS
1340 =cut
1341 */
1343 #define LEX_FAKE_EOF 0x80000000
1344 #define LEX_NO_TERM 0x40000000 /* here-doc */
1346 bool
1347 Perl_lex_next_chunk(pTHX_ U32 flags)
1348 {
1349 SV *linestr;
1350 char *buf;
1351 STRLEN old_bufend_pos, new_bufend_pos;
1352 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1353 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1354 bool got_some_for_debugger = 0;
1355 bool got_some;
1357 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1358 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1359 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1360 return FALSE;
1361 linestr = PL_parser->linestr;
1362 buf = SvPVX(linestr);
1363 if (!(flags & LEX_KEEP_PREVIOUS)
1364 && PL_parser->bufptr == PL_parser->bufend)
1365 {
1366 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1367 linestart_pos = 0;
1368 if (PL_parser->last_uni != PL_parser->bufend)
1369 PL_parser->last_uni = NULL;
1370 if (PL_parser->last_lop != PL_parser->bufend)
1371 PL_parser->last_lop = NULL;
1372 last_uni_pos = last_lop_pos = 0;
1373 *buf = 0;
1374 SvCUR_set(linestr, 0);
1375 } else {
1376 old_bufend_pos = PL_parser->bufend - buf;
1377 bufptr_pos = PL_parser->bufptr - buf;
1378 oldbufptr_pos = PL_parser->oldbufptr - buf;
1379 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1380 linestart_pos = PL_parser->linestart - buf;
1381 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1382 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1383 }
1384 if (flags & LEX_FAKE_EOF) {
1385 goto eof;
1386 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1387 got_some = 0;
1388 } else if (filter_gets(linestr, old_bufend_pos)) {
1389 got_some = 1;
1390 got_some_for_debugger = 1;
1391 } else if (flags & LEX_NO_TERM) {
1392 got_some = 0;
1393 } else {
1394 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1395 SvPVCLEAR(linestr);
1396 eof:
1397 /* End of real input. Close filehandle (unless it was STDIN),
1398 * then add implicit termination.
1399 */
1400 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1401 PerlIO_clearerr(PL_parser->rsfp);
1402 else if (PL_parser->rsfp)
1403 (void)PerlIO_close(PL_parser->rsfp);
1404 PL_parser->rsfp = NULL;
1405 PL_parser->in_pod = PL_parser->filtered = 0;
1406 if (!PL_in_eval && PL_minus_p) {
1407 sv_catpvs(linestr,
1408 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1409 PL_minus_n = PL_minus_p = 0;
1410 } else if (!PL_in_eval && PL_minus_n) {
1411 sv_catpvs(linestr, /*{*/";}");
1412 PL_minus_n = 0;
1413 } else
1414 sv_catpvs(linestr, ";");
1415 got_some = 1;
1416 }
1417 buf = SvPVX(linestr);
1418 new_bufend_pos = SvCUR(linestr);
1419 PL_parser->bufend = buf + new_bufend_pos;
1420 PL_parser->bufptr = buf + bufptr_pos;
1422 if (UTF) {
1423 const U8* first_bad_char_loc;
1424 if (UNLIKELY(! is_utf8_string_loc(
1425 (U8 *) PL_parser->bufptr,
1426 PL_parser->bufend - PL_parser->bufptr,
1427 &first_bad_char_loc)))
1428 {
1429 _force_out_malformed_utf8_message(first_bad_char_loc,
1430 (U8 *) PL_parser->bufend,
1431 0,
1432 1 /* 1 means die */ );
1433 NOT_REACHED; /* NOTREACHED */
1434 }
1435 }
1437 PL_parser->oldbufptr = buf + oldbufptr_pos;
1438 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1439 PL_parser->linestart = buf + linestart_pos;
1440 if (PL_parser->last_uni)
1441 PL_parser->last_uni = buf + last_uni_pos;
1442 if (PL_parser->last_lop)
1443 PL_parser->last_lop = buf + last_lop_pos;
1444 if (PL_parser->preambling != NOLINE) {
1445 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1446 PL_parser->preambling = NOLINE;
1447 }
1448 if ( got_some_for_debugger
1449 && PERLDB_LINE_OR_SAVESRC
1450 && PL_curstash != PL_debstash)
1451 {
1452 /* debugger active and we're not compiling the debugger code,
1453 * so store the line into the debugger's array of lines
1454 */
1455 update_debugger_info(NULL, buf+old_bufend_pos,
1456 new_bufend_pos-old_bufend_pos);
1457 }
1458 return got_some;
1459 }
1461 /*
1462 =for apidoc lex_peek_unichar
1464 Looks ahead one (Unicode) character in the text currently being lexed.
1465 Returns the codepoint (unsigned integer value) of the next character,
1466 or -1 if lexing has reached the end of the input text. To consume the
1467 peeked character, use L</lex_read_unichar>.
1469 If the next character is in (or extends into) the next chunk of input
1470 text, the next chunk will be read in. Normally the current chunk will be
1471 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1472 bit set, then the current chunk will not be discarded.
1474 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1475 is encountered, an exception is generated.
1477 =cut
1478 */
1480 I32
1481 Perl_lex_peek_unichar(pTHX_ U32 flags)
1482 {
1483 char *s, *bufend;
1484 if (flags & ~(LEX_KEEP_PREVIOUS))
1485 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1486 s = PL_parser->bufptr;
1487 bufend = PL_parser->bufend;
1488 if (UTF) {
1489 U8 head;
1490 I32 unichar;
1491 STRLEN len, retlen;
1492 if (s == bufend) {
1493 if (!lex_next_chunk(flags))
1494 return -1;
1495 s = PL_parser->bufptr;
1496 bufend = PL_parser->bufend;
1497 }
1498 head = (U8)*s;
1499 if (UTF8_IS_INVARIANT(head))
1500 return head;
1501 if (UTF8_IS_START(head)) {
1502 len = UTF8SKIP(&head);
1503 while ((STRLEN)(bufend-s) < len) {
1504 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1505 break;
1506 s = PL_parser->bufptr;
1507 bufend = PL_parser->bufend;
1508 }
1509 }
1510 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1511 if (retlen == (STRLEN)-1) {
1512 _force_out_malformed_utf8_message((U8 *) s,
1513 (U8 *) bufend,
1514 0,
1515 1 /* 1 means die */ );
1516 NOT_REACHED; /* NOTREACHED */
1517 }
1518 return unichar;
1519 } else {
1520 if (s == bufend) {
1521 if (!lex_next_chunk(flags))
1522 return -1;
1523 s = PL_parser->bufptr;
1524 }
1525 return (U8)*s;
1526 }
1527 }
1529 /*
1530 =for apidoc lex_read_unichar
1532 Reads the next (Unicode) character in the text currently being lexed.
1533 Returns the codepoint (unsigned integer value) of the character read,
1534 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1535 if lexing has reached the end of the input text. To non-destructively
1536 examine the next character, use L</lex_peek_unichar> instead.
1538 If the next character is in (or extends into) the next chunk of input
1539 text, the next chunk will be read in. Normally the current chunk will be
1540 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1541 bit set, then the current chunk will not be discarded.
1543 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1544 is encountered, an exception is generated.
1546 =cut
1547 */
1549 I32
1550 Perl_lex_read_unichar(pTHX_ U32 flags)
1551 {
1552 I32 c;
1553 if (flags & ~(LEX_KEEP_PREVIOUS))
1554 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1555 c = lex_peek_unichar(flags);
1556 if (c != -1) {
1557 if (c == '\n')
1558 COPLINE_INC_WITH_HERELINES;
1559 if (UTF)
1560 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1561 else
1562 ++(PL_parser->bufptr);
1563 }
1564 return c;
1565 }
1567 /*
1568 =for apidoc lex_read_space
1570 Reads optional spaces, in Perl style, in the text currently being
1571 lexed. The spaces may include ordinary whitespace characters and
1572 Perl-style comments. C<#line> directives are processed if encountered.
1573 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1574 at a non-space character (or the end of the input text).
1576 If spaces extend into the next chunk of input text, the next chunk will
1577 be read in. Normally the current chunk will be discarded at the same
1578 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1579 chunk will not be discarded.
1581 =cut
1582 */
1584 #define LEX_NO_INCLINE 0x40000000
1585 #define LEX_NO_NEXT_CHUNK 0x80000000
1587 void
1588 Perl_lex_read_space(pTHX_ U32 flags)
1589 {
1590 char *s, *bufend;
1591 const bool can_incline = !(flags & LEX_NO_INCLINE);
1592 bool need_incline = 0;
1593 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1594 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1595 s = PL_parser->bufptr;
1596 bufend = PL_parser->bufend;
1597 while (1) {
1598 char c = *s;
1599 if (c == '#') {
1600 do {
1601 c = *++s;
1602 } while (!(c == '\n' || (c == 0 && s == bufend)));
1603 } else if (c == '\n') {
1604 s++;
1605 if (can_incline) {
1606 PL_parser->linestart = s;
1607 if (s == bufend)
1608 need_incline = 1;
1609 else
1610 incline(s, bufend);
1611 }
1612 } else if (isSPACE(c)) {
1613 s++;
1614 } else if (c == 0 && s == bufend) {
1615 bool got_more;
1616 line_t l;
1617 if (flags & LEX_NO_NEXT_CHUNK)
1618 break;
1619 PL_parser->bufptr = s;
1620 l = CopLINE(PL_curcop);
1621 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1622 got_more = lex_next_chunk(flags);
1623 CopLINE_set(PL_curcop, l);
1624 s = PL_parser->bufptr;
1625 bufend = PL_parser->bufend;
1626 if (!got_more)
1627 break;
1628 if (can_incline && need_incline && PL_parser->rsfp) {
1629 incline(s, bufend);
1630 need_incline = 0;
1631 }
1632 } else if (!c) {
1633 s++;
1634 } else {
1635 break;
1636 }
1637 }
1638 PL_parser->bufptr = s;
1639 }
1641 /*
1643 =for apidoc validate_proto
1645 This function performs syntax checking on a prototype, C<proto>.
1646 If C<warn> is true, any illegal characters or mismatched brackets
1647 will trigger illegalproto warnings, declaring that they were
1648 detected in the prototype for C<name>.
1650 The return value is C<true> if this is a valid prototype, and
1651 C<false> if it is not, regardless of whether C<warn> was C<true> or
1652 C<false>.
1654 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1656 =cut
1658 */
1660 bool
1661 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
1662 {
1663 STRLEN len, origlen;
1664 char *p;
1665 bool bad_proto = FALSE;
1666 bool in_brackets = FALSE;
1667 bool after_slash = FALSE;
1668 char greedy_proto = ' ';
1669 bool proto_after_greedy_proto = FALSE;
1670 bool must_be_last = FALSE;
1671 bool underscore = FALSE;
1672 bool bad_proto_after_underscore = FALSE;
1674 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1676 if (!proto)
1677 return TRUE;
1679 p = SvPV(proto, len);
1680 origlen = len;
1681 for (; len--; p++) {
1682 if (!isSPACE(*p)) {
1683 if (must_be_last)
1684 proto_after_greedy_proto = TRUE;
1685 if (underscore) {
1686 if (!memCHRs(";@%", *p))
1687 bad_proto_after_underscore = TRUE;
1688 underscore = FALSE;
1689 }
1690 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
1691 bad_proto = TRUE;
1692 }
1693 else {
1694 if (*p == '[')
1695 in_brackets = TRUE;
1696 else if (*p == ']')
1697 in_brackets = FALSE;
1698 else if ((*p == '@' || *p == '%')
1699 && !after_slash
1700 && !in_brackets )
1701 {
1702 must_be_last = TRUE;
1703 greedy_proto = *p;
1704 }
1705 else if (*p == '_')
1706 underscore = TRUE;
1707 }
1708 if (*p == '\\')
1709 after_slash = TRUE;
1710 else
1711 after_slash = FALSE;
1712 }
1713 }
1715 if (warn) {
1716 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1717 p -= origlen;
1718 p = SvUTF8(proto)
1719 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1720 origlen, UNI_DISPLAY_ISPRINT)
1721 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1723 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1724 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1725 sv_catpvs(name2, "::");
1726 sv_catsv(name2, (SV *)name);
1727 name = name2;
1728 }
1730 if (proto_after_greedy_proto)
1731 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1732 "Prototype after '%c' for %" SVf " : %s",
1733 greedy_proto, SVfARG(name), p);
1734 if (in_brackets)
1735 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1736 "Missing ']' in prototype for %" SVf " : %s",
1737 SVfARG(name), p);
1738 if (bad_proto)
1739 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1740 "Illegal character in prototype for %" SVf " : %s",
1741 SVfARG(name), p);
1742 if (bad_proto_after_underscore)
1743 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1744 "Illegal character after '_' in prototype for %" SVf " : %s",
1745 SVfARG(name), p);
1746 }
1748 return (! (proto_after_greedy_proto || bad_proto) );
1749 }
1751 /*
1752 * S_incline
1753 * This subroutine has nothing to do with tilting, whether at windmills
1754 * or pinball tables. Its name is short for "increment line". It
1755 * increments the current line number in CopLINE(PL_curcop) and checks
1756 * to see whether the line starts with a comment of the form
1757 * # line 500 "foo.pm"
1758 * If so, it sets the current line number and file to the values in the comment.
1759 */
1761 STATIC void
1762 S_incline(pTHX_ const char *s, const char *end)
1763 {
1764 const char *t;
1765 const char *n;
1766 const char *e;
1767 line_t line_num;
1768 UV uv;
1770 PERL_ARGS_ASSERT_INCLINE;
1772 assert(end >= s);
1774 COPLINE_INC_WITH_HERELINES;
1775 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1776 && s+1 == PL_bufend && *s == ';') {
1777 /* fake newline in string eval */
1778 CopLINE_dec(PL_curcop);
1779 return;
1780 }
1781 if (*s++ != '#')
1782 return;
1783 while (SPACE_OR_TAB(*s))
1784 s++;
1785 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1786 s += sizeof("line") - 1;
1787 else
1788 return;
1789 if (SPACE_OR_TAB(*s))
1790 s++;
1791 else
1792 return;
1793 while (SPACE_OR_TAB(*s))
1794 s++;
1795 if (!isDIGIT(*s))
1796 return;
1798 n = s;
1799 while (isDIGIT(*s))
1800 s++;
1801 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1802 return;
1803 while (SPACE_OR_TAB(*s))
1804 s++;
1805 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
1806 s++;
1807 e = t + 1;
1808 }
1809 else {
1810 t = s;
1811 while (*t && !isSPACE(*t))
1812 t++;
1813 e = t;
1814 }
1815 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1816 e++;
1817 if (*e != '\n' && *e != '\0')
1818 return; /* false alarm */
1820 if (!grok_atoUV(n, &uv, &e))
1821 return;
1822 line_num = ((line_t)uv) - 1;
1824 if (t - s > 0) {
1825 const STRLEN len = t - s;
1827 if (!PL_rsfp && !PL_parser->filtered) {
1828 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1829 * to *{"::_<newfilename"} */
1830 /* However, the long form of evals is only turned on by the
1831 debugger - usually they're "(eval %lu)" */
1832 GV * const cfgv = CopFILEGV(PL_curcop);
1833 if (cfgv) {
1834 char smallbuf[128];
1835 STRLEN tmplen2 = len;
1836 char *tmpbuf2;
1837 GV *gv2;
1839 if (tmplen2 + 2 <= sizeof smallbuf)
1840 tmpbuf2 = smallbuf;
1841 else
1842 Newx(tmpbuf2, tmplen2 + 2, char);
1844 tmpbuf2[0] = '_';
1845 tmpbuf2[1] = '<';
1847 memcpy(tmpbuf2 + 2, s, tmplen2);
1848 tmplen2 += 2;
1850 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1851 if (!isGV(gv2)) {
1852 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1853 /* adjust ${"::_<newfilename"} to store the new file name */
1854 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1855 /* The line number may differ. If that is the case,
1856 alias the saved lines that are in the array.
1857 Otherwise alias the whole array. */
1858 if (CopLINE(PL_curcop) == line_num) {
1859 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1860 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1861 }
1862 else if (GvAV(cfgv)) {
1863 AV * const av = GvAV(cfgv);
1864 const line_t start = CopLINE(PL_curcop)+1;
1865 SSize_t items = AvFILLp(av) - start;
1866 if (items > 0) {
1867 AV * const av2 = GvAVn(gv2);
1868 SV **svp = AvARRAY(av) + start;
1869 Size_t l = line_num+1;
1870 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1871 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
1872 }
1873 }
1874 }
1876 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1877 }
1878 }
1879 CopFILE_free(PL_curcop);
1880 CopFILE_setn(PL_curcop, s, len);
1881 }
1882 CopLINE_set(PL_curcop, line_num);
1883 }
1885 STATIC void
1886 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1887 {
1888 AV *av = CopFILEAVx(PL_curcop);
1889 if (av) {
1890 SV * sv;
1891 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1892 else {
1893 sv = *av_fetch(av, 0, 1);
1894 SvUPGRADE(sv, SVt_PVMG);
1895 }
1896 if (!SvPOK(sv)) SvPVCLEAR(sv);
1897 if (orig_sv)
1898 sv_catsv(sv, orig_sv);
1899 else
1900 sv_catpvn(sv, buf, len);
1901 if (!SvIOK(sv)) {
1902 (void)SvIOK_on(sv);
1903 SvIV_set(sv, 0);
1904 }
1905 if (PL_parser->preambling == NOLINE)
1906 av_store(av, CopLINE(PL_curcop), sv);
1907 }
1908 }
1910 /*
1911 * skipspace
1912 * Called to gobble the appropriate amount and type of whitespace.
1913 * Skips comments as well.
1914 * Returns the next character after the whitespace that is skipped.
1915 *
1916 * peekspace
1917 * Same thing, but look ahead without incrementing line numbers or
1918 * adjusting PL_linestart.
1919 */
1921 #define skipspace(s) skipspace_flags(s, 0)
1922 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1924 char *
1925 Perl_skipspace_flags(pTHX_ char *s, U32 flags)
1926 {
1927 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1928 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1929 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1930 s++;
1931 } else {
1932 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1933 PL_bufptr = s;
1934 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1935 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1936 LEX_NO_NEXT_CHUNK : 0));
1937 s = PL_bufptr;
1938 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1939 if (PL_linestart > PL_bufptr)
1940 PL_bufptr = PL_linestart;
1941 return s;
1942 }
1943 return s;
1944 }
1946 /*
1947 * S_check_uni
1948 * Check the unary operators to ensure there's no ambiguity in how they're
1949 * used. An ambiguous piece of code would be:
1950 * rand + 5
1951 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1952 * the +5 is its argument.
1953 */
1955 STATIC void
1956 S_check_uni(pTHX)
1957 {
1958 const char *s;
1960 if (PL_oldoldbufptr != PL_last_uni)
1961 return;
1962 while (isSPACE(*PL_last_uni))
1963 PL_last_uni++;
1964 s = PL_last_uni;
1965 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1966 s += UTF ? UTF8SKIP(s) : 1;
1967 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
1968 return;
1970 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1971 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1972 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1973 }
1975 /*
1976 * LOP : macro to build a list operator. Its behaviour has been replaced
1977 * with a subroutine, S_lop() for which LOP is just another name.
1978 */
1980 #define LOP(f,x) return lop(f,x,s)
1982 /*
1983 * S_lop
1984 * Build a list operator (or something that might be one). The rules:
1985 * - if we have a next token, then it's a list operator (no parens) for
1986 * which the next token has already been parsed; e.g.,
1987 * sort foo @args
1988 * sort foo (@args)
1989 * - if the next thing is an opening paren, then it's a function
1990 * - else it's a list operator
1991 */
1993 STATIC I32
1994 S_lop(pTHX_ I32 f, U8 x, char *s)
1995 {
1996 PERL_ARGS_ASSERT_LOP;
1998 pl_yylval.ival = f;
1999 CLINE;
2000 PL_bufptr = s;
2001 PL_last_lop = PL_oldbufptr;
2002 PL_last_lop_op = (OPCODE)f;
2003 if (PL_nexttoke)
2004 goto lstop;
2005 PL_expect = x;
2006 if (*s == '(')
2007 return REPORT(FUNC);
2008 s = skipspace(s);
2009 if (*s == '(')
2010 return REPORT(FUNC);
2011 else {
2012 lstop:
2013 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2014 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
2015 return REPORT(LSTOP);
2016 }
2017 }
2019 /*
2020 * S_force_next
2021 * When the lexer realizes it knows the next token (for instance,
2022 * it is reordering tokens for the parser) then it can call S_force_next
2023 * to know what token to return the next time the lexer is called. Caller
2024 * will need to set PL_nextval[] and possibly PL_expect to ensure
2025 * the lexer handles the token correctly.
2026 */
2028 STATIC void
2029 S_force_next(pTHX_ I32 type)
2030 {
2031 #ifdef DEBUGGING
2032 if (DEBUG_T_TEST) {
2033 PerlIO_printf(Perl_debug_log, "### forced token:\n");
2034 tokereport(type, &NEXTVAL_NEXTTOKE);
2035 }
2036 #endif
2037 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
2038 PL_nexttype[PL_nexttoke] = type;
2039 PL_nexttoke++;
2040 }
2042 /*
2043 * S_postderef
2044 *
2045 * This subroutine handles postfix deref syntax after the arrow has already
2046 * been emitted. @* $* etc. are emitted as two separate tokens right here.
2047 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2048 * only the first, leaving yylex to find the next.
2049 */
2051 static int
2052 S_postderef(pTHX_ int const funny, char const next)
2053 {
2054 assert(funny == DOLSHARP
2055 || memCHRs("$@%&*", funny)
2056 || funny == PERLY_DOLLAR
2057 || funny == PERLY_SNAIL
2058 || funny == PERLY_PERCENT_SIGN
2059 || funny == PERLY_AMPERSAND
2060 || funny == PERLY_STAR
2061 );
2062 if (next == '*') {
2063 PL_expect = XOPERATOR;
2064 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
2065 assert(PERLY_SNAIL == funny || PERLY_DOLLAR == funny || DOLSHARP == funny);
2066 PL_lex_state = LEX_INTERPEND;
2067 if (PERLY_SNAIL == funny)
2068 force_next(POSTJOIN);
2069 }
2070 force_next(PERLY_STAR);
2071 PL_bufptr+=2;
2072 }
2073 else {
2074 if (PERLY_SNAIL == funny && PL_lex_state == LEX_INTERPNORMAL
2075 && !PL_lex_brackets)
2076 PL_lex_dojoin = 2;
2077 PL_expect = XOPERATOR;
2078 PL_bufptr++;
2079 }
2080 return funny;
2081 }
2083 void
2084 Perl_yyunlex(pTHX)
2085 {
2086 int yyc = PL_parser->yychar;
2087 if (yyc != YYEMPTY) {
2088 if (yyc) {
2089 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2090 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == PERLY_BRACKET_OPEN) {
2091 PL_lex_allbrackets--;
2092 PL_lex_brackets--;
2093 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2094 } else if (yyc == PERLY_PAREN_OPEN) {
2095 PL_lex_allbrackets--;
2096 yyc |= (2<<24);
2097 }
2098 force_next(yyc);
2099 }
2100 PL_parser->yychar = YYEMPTY;
2101 }
2102 }
2104 STATIC SV *
2105 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2106 {
2107 SV * const sv = newSVpvn_utf8(start, len,
2108 ! IN_BYTES
2109 && UTF
2110 && len != 0
2111 && is_utf8_non_invariant_string((const U8*)start, len));
2112 return sv;
2113 }
2115 /*
2116 * S_force_word
2117 * When the lexer knows the next thing is a word (for instance, it has
2118 * just seen -> and it knows that the next char is a word char, then
2119 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2120 * lookahead.
2121 *
2122 * Arguments:
2123 * char *start : buffer position (must be within PL_linestr)
2124 * int token : PL_next* will be this type of bare word
2125 * (e.g., METHOD,BAREWORD)
2126 * int check_keyword : if true, Perl checks to make sure the word isn't
2127 * a keyword (do this if the word is a label, e.g. goto FOO)
2128 * int allow_pack : if true, : characters will also be allowed (require,
2129 * use, etc. do this)
2130 */
2132 STATIC char *
2133 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2134 {
2135 char *s;
2136 STRLEN len;
2138 PERL_ARGS_ASSERT_FORCE_WORD;
2140 start = skipspace(start);
2141 s = start;
2142 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2143 || (allow_pack && *s == ':' && s[1] == ':') )
2144 {
2145 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2146 if (check_keyword) {
2147 char *s2 = PL_tokenbuf;
2148 STRLEN len2 = len;
2149 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2150 s2 += sizeof("CORE::") - 1;
2151 len2 -= sizeof("CORE::") - 1;
2152 }
2153 if (keyword(s2, len2, 0))
2154 return start;
2155 }
2156 if (token == METHOD) {
2157 s = skipspace(s);
2158 if (*s == '(')
2159 PL_expect = XTERM;
2160 else {
2161 PL_expect = XOPERATOR;
2162 }
2163 }
2164 NEXTVAL_NEXTTOKE.opval
2165 = newSVOP(OP_CONST,0,
2166 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2167 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2168 force_next(token);
2169 }
2170 return s;
2171 }
2173 /*
2174 * S_force_ident
2175 * Called when the lexer wants $foo *foo &foo etc, but the program
2176 * text only contains the "foo" portion. The first argument is a pointer
2177 * to the "foo", and the second argument is the type symbol to prefix.
2178 * Forces the next token to be a "BAREWORD".
2179 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2180 */
2182 STATIC void
2183 S_force_ident(pTHX_ const char *s, int kind)
2184 {
2185 PERL_ARGS_ASSERT_FORCE_IDENT;
2187 if (s[0]) {
2188 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2189 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2190 UTF ? SVf_UTF8 : 0));
2191 NEXTVAL_NEXTTOKE.opval = o;
2192 force_next(BAREWORD);
2193 if (kind) {
2194 o->op_private = OPpCONST_ENTERED;
2195 /* XXX see note in pp_entereval() for why we forgo typo
2196 warnings if the symbol must be introduced in an eval.
2197 GSAR 96-10-12 */
2198 gv_fetchpvn_flags(s, len,
2199 (PL_in_eval ? GV_ADDMULTI
2200 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2201 kind == PERLY_DOLLAR ? SVt_PV :
2202 kind == PERLY_SNAIL ? SVt_PVAV :
2203 kind == PERLY_PERCENT_SIGN ? SVt_PVHV :
2204 SVt_PVGV
2205 );
2206 }
2207 }
2208 }
2210 static void
2211 S_force_ident_maybe_lex(pTHX_ char pit)
2212 {
2213 NEXTVAL_NEXTTOKE.ival = pit;
2214 force_next('p');
2215 }
2217 NV
2218 Perl_str_to_version(pTHX_ SV *sv)
2219 {
2220 NV retval = 0.0;
2221 NV nshift = 1.0;
2222 STRLEN len;
2223 const char *start = SvPV_const(sv,len);
2224 const char * const end = start + len;
2225 const bool utf = cBOOL(SvUTF8(sv));
2227 PERL_ARGS_ASSERT_STR_TO_VERSION;
2229 while (start < end) {
2230 STRLEN skip;
2231 UV n;
2232 if (utf)
2233 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2234 else {
2235 n = *(U8*)start;
2236 skip = 1;
2237 }
2238 retval += ((NV)n)/nshift;
2239 start += skip;
2240 nshift *= 1000;
2241 }
2242 return retval;
2243 }
2245 /*
2246 * S_force_version
2247 * Forces the next token to be a version number.
2248 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2249 * and if "guessing" is TRUE, then no new token is created (and the caller
2250 * must use an alternative parsing method).
2251 */
2253 STATIC char *
2254 S_force_version(pTHX_ char *s, int guessing)
2255 {
2256 OP *version = NULL;
2257 char *d;
2259 PERL_ARGS_ASSERT_FORCE_VERSION;
2261 s = skipspace(s);
2263 d = s;
2264 if (*d == 'v')
2265 d++;
2266 if (isDIGIT(*d)) {
2267 while (isDIGIT(*d) || *d == '_' || *d == '.')
2268 d++;
2269 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2270 SV *ver;
2271 s = scan_num(s, &pl_yylval);
2272 version = pl_yylval.opval;
2273 ver = cSVOPx(version)->op_sv;
2274 if (SvPOK(ver) && !SvNIOK(ver)) {
2275 SvUPGRADE(ver, SVt_PVNV);
2276 SvNV_set(ver, str_to_version(ver));
2277 SvNOK_on(ver); /* hint that it is a version */
2278 }
2279 }
2280 else if (guessing) {
2281 return s;
2282 }
2283 }
2285 /* NOTE: The parser sees the package name and the VERSION swapped */
2286 NEXTVAL_NEXTTOKE.opval = version;
2287 force_next(BAREWORD);
2289 return s;
2290 }
2292 /*
2293 * S_force_strict_version
2294 * Forces the next token to be a version number using strict syntax rules.
2295 */
2297 STATIC char *
2298 S_force_strict_version(pTHX_ char *s)
2299 {
2300 OP *version = NULL;
2301 const char *errstr = NULL;
2303 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2305 while (isSPACE(*s)) /* leading whitespace */
2306 s++;
2308 if (is_STRICT_VERSION(s,&errstr)) {
2309 SV *ver = newSV(0);
2310 s = (char *)scan_version(s, ver, 0);
2311 version = newSVOP(OP_CONST, 0, ver);
2312 }
2313 else if ((*s != ';' && *s != '{' && *s != '}' )
2314 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2315 {
2316 PL_bufptr = s;
2317 if (errstr)
2318 yyerror(errstr); /* version required */
2319 return s;
2320 }
2322 /* NOTE: The parser sees the package name and the VERSION swapped */
2323 NEXTVAL_NEXTTOKE.opval = version;
2324 force_next(BAREWORD);
2326 return s;
2327 }
2329 /*
2330 * S_tokeq
2331 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2332 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2333 * unchanged, and a new SV containing the modified input is returned.
2334 */
2336 STATIC SV *
2337 S_tokeq(pTHX_ SV *sv)
2338 {
2339 char *s;
2340 char *send;
2341 char *d;
2342 SV *pv = sv;
2344 PERL_ARGS_ASSERT_TOKEQ;
2346 assert (SvPOK(sv));
2347 assert (SvLEN(sv));
2348 assert (!SvIsCOW(sv));
2349 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2350 goto finish;
2351 s = SvPVX(sv);
2352 send = SvEND(sv);
2353 /* This is relying on the SV being "well formed" with a trailing '\0' */
2354 while (s < send && !(*s == '\\' && s[1] == '\\'))
2355 s++;
2356 if (s == send)
2357 goto finish;
2358 d = s;
2359 if ( PL_hints & HINT_NEW_STRING ) {
2360 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2361 SVs_TEMP | SvUTF8(sv));
2362 }
2363 while (s < send) {
2364 if (*s == '\\') {
2365 if (s + 1 < send && (s[1] == '\\'))
2366 s++; /* all that, just for this */
2367 }
2368 *d++ = *s++;
2369 }
2370 *d = '\0';
2371 SvCUR_set(sv, d - SvPVX_const(sv));
2372 finish:
2373 if ( PL_hints & HINT_NEW_STRING )
2374 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
2375 return sv;
2376 }
2378 /*
2379 * Now come three functions related to double-quote context,
2380 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2381 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2382 * interact with PL_lex_state, and create fake ( ... ) argument lists
2383 * to handle functions and concatenation.
2384 * For example,
2385 * "foo\lbar"
2386 * is tokenised as
2387 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2388 */
2390 /*
2391 * S_sublex_start
2392 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2393 *
2394 * Pattern matching will set PL_lex_op to the pattern-matching op to
2395 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2396 *
2397 * OP_CONST is easy--just make the new op and return.
2398 *
2399 * Everything else becomes a FUNC.
2400 *
2401 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2402 * had an OP_CONST. This just sets us up for a
2403 * call to S_sublex_push().
2404 */
2406 STATIC I32
2407 S_sublex_start(pTHX)
2408 {
2409 const I32 op_type = pl_yylval.ival;
2411 if (op_type == OP_NULL) {
2412 pl_yylval.opval = PL_lex_op;
2413 PL_lex_op = NULL;
2414 return THING;
2415 }
2416 if (op_type == OP_CONST) {
2417 SV *sv = PL_lex_stuff;
2418 PL_lex_stuff = NULL;
2419 sv = tokeq(sv);
2421 if (SvTYPE(sv) == SVt_PVIV) {
2422 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2423 STRLEN len;
2424 const char * const p = SvPV_const(sv, len);
2425 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2426 SvREFCNT_dec(sv);
2427 sv = nsv;
2428 }
2429 pl_yylval.opval = newSVOP(op_type, 0, sv);
2430 return THING;
2431 }
2433 PL_parser->lex_super_state = PL_lex_state;
2434 PL_parser->lex_sub_inwhat = (U16)op_type;
2435 PL_parser->lex_sub_op = PL_lex_op;
2436 PL_parser->sub_no_recover = FALSE;
2437 PL_parser->sub_error_count = PL_error_count;
2438 PL_lex_state = LEX_INTERPPUSH;
2440 PL_expect = XTERM;
2441 if (PL_lex_op) {
2442 pl_yylval.opval = PL_lex_op;
2443 PL_lex_op = NULL;
2444 return PMFUNC;
2445 }
2446 else
2447 return FUNC;
2448 }
2450 /*
2451 * S_sublex_push
2452 * Create a new scope to save the lexing state. The scope will be
2453 * ended in S_sublex_done. Returns a '(', starting the function arguments
2454 * to the uc, lc, etc. found before.
2455 * Sets PL_lex_state to LEX_INTERPCONCAT.
2456 */
2458 STATIC I32
2459 S_sublex_push(pTHX)
2460 {
2461 LEXSHARED *shared;
2462 const bool is_heredoc = PL_multi_close == '<';
2463 ENTER;
2465 PL_lex_state = PL_parser->lex_super_state;
2466 SAVEI8(PL_lex_dojoin);
2467 SAVEI32(PL_lex_brackets);
2468 SAVEI32(PL_lex_allbrackets);
2469 SAVEI32(PL_lex_formbrack);
2470 SAVEI8(PL_lex_fakeeof);
2471 SAVEI32(PL_lex_casemods);
2472 SAVEI32(PL_lex_starts);
2473 SAVEI8(PL_lex_state);
2474 SAVESPTR(PL_lex_repl);
2475 SAVEVPTR(PL_lex_inpat);
2476 SAVEI16(PL_lex_inwhat);
2477 if (is_heredoc)
2478 {
2479 SAVECOPLINE(PL_curcop);
2480 SAVEI32(PL_multi_end);
2481 SAVEI32(PL_parser->herelines);
2482 PL_parser->herelines = 0;
2483 }
2484 SAVEIV(PL_multi_close);
2485 SAVEPPTR(PL_bufptr);
2486 SAVEPPTR(PL_bufend);
2487 SAVEPPTR(PL_oldbufptr);
2488 SAVEPPTR(PL_oldoldbufptr);
2489 SAVEPPTR(PL_last_lop);
2490 SAVEPPTR(PL_last_uni);
2491 SAVEPPTR(PL_linestart);
2492 SAVESPTR(PL_linestr);
2493 SAVEGENERICPV(PL_lex_brackstack);
2494 SAVEGENERICPV(PL_lex_casestack);
2495 SAVEGENERICPV(PL_parser->lex_shared);
2496 SAVEBOOL(PL_parser->lex_re_reparsing);
2497 SAVEI32(PL_copline);
2499 /* The here-doc parser needs to be able to peek into outer lexing
2500 scopes to find the body of the here-doc. So we put PL_linestr and
2501 PL_bufptr into lex_shared, to ‘share’ those values.
2502 */
2503 PL_parser->lex_shared->ls_linestr = PL_linestr;
2504 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2506 PL_linestr = PL_lex_stuff;
2507 PL_lex_repl = PL_parser->lex_sub_repl;
2508 PL_lex_stuff = NULL;
2509 PL_parser->lex_sub_repl = NULL;
2511 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2512 set for an inner quote-like operator and then an error causes scope-
2513 popping. We must not have a PL_lex_stuff value left dangling, as
2514 that breaks assumptions elsewhere. See bug #123617. */
2515 SAVEGENERICSV(PL_lex_stuff);
2516 SAVEGENERICSV(PL_parser->lex_sub_repl);
2518 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2519 = SvPVX(PL_linestr);
2520 PL_bufend += SvCUR(PL_linestr);
2521 PL_last_lop = PL_last_uni = NULL;
2522 SAVEFREESV(PL_linestr);
2523 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2525 PL_lex_dojoin = FALSE;
2526 PL_lex_brackets = PL_lex_formbrack = 0;
2527 PL_lex_allbrackets = 0;
2528 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2529 Newx(PL_lex_brackstack, 120, char);
2530 Newx(PL_lex_casestack, 12, char);
2531 PL_lex_casemods = 0;
2532 *PL_lex_casestack = '\0';
2533 PL_lex_starts = 0;
2534 PL_lex_state = LEX_INTERPCONCAT;
2535 if (is_heredoc)
2536 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2537 PL_copline = NOLINE;
2539 Newxz(shared, 1, LEXSHARED);
2540 shared->ls_prev = PL_parser->lex_shared;
2541 PL_parser->lex_shared = shared;
2543 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2544 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2545 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2546 PL_lex_inpat = PL_parser->lex_sub_op;
2547 else
2548 PL_lex_inpat = NULL;
2550 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2551 PL_in_eval &= ~EVAL_RE_REPARSING;
2553 return SUBLEXSTART;
2554 }
2556 /*
2557 * S_sublex_done
2558 * Restores lexer state after a S_sublex_push.
2559 */
2561 STATIC I32
2562 S_sublex_done(pTHX)
2563 {
2564 if (!PL_lex_starts++) {
2565 SV * const sv = newSVpvs("");
2566 if (SvUTF8(PL_linestr))
2567 SvUTF8_on(sv);
2568 PL_expect = XOPERATOR;
2569 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2570 return THING;
2571 }
2573 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2574 PL_lex_state = LEX_INTERPCASEMOD;
2575 return yylex();
2576 }
2578 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2579 assert(PL_lex_inwhat != OP_TRANSR);
2580 if (PL_lex_repl) {
2581 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2582 PL_linestr = PL_lex_repl;
2583 PL_lex_inpat = 0;
2584 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2585 PL_bufend += SvCUR(PL_linestr);
2586 PL_last_lop = PL_last_uni = NULL;
2587 PL_lex_dojoin = FALSE;
2588 PL_lex_brackets = 0;
2589 PL_lex_allbrackets = 0;
2590 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2591 PL_lex_casemods = 0;
2592 *PL_lex_casestack = '\0';
2593 PL_lex_starts = 0;
2594 if (SvEVALED(PL_lex_repl)) {
2595 PL_lex_state = LEX_INTERPNORMAL;
2596 PL_lex_starts++;
2597 /* we don't clear PL_lex_repl here, so that we can check later
2598 whether this is an evalled subst; that means we rely on the
2599 logic to ensure sublex_done() is called again only via the
2600 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2601 }
2602 else {
2603 PL_lex_state = LEX_INTERPCONCAT;
2604 PL_lex_repl = NULL;
2605 }
2606 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2607 CopLINE(PL_curcop) +=
2608 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2609 + PL_parser->herelines;
2610 PL_parser->herelines = 0;
2611 }
2612 return PERLY_SLASH;
2613 }
2614 else {
2615 const line_t l = CopLINE(PL_curcop);
2616 LEAVE;
2617 if (PL_parser->sub_error_count != PL_error_count) {
2618 if (PL_parser->sub_no_recover) {
2619 yyquit();
2620 NOT_REACHED;
2621 }
2622 }
2623 if (PL_multi_close == '<')
2624 PL_parser->herelines += l - PL_multi_end;
2625 PL_bufend = SvPVX(PL_linestr);
2626 PL_bufend += SvCUR(PL_linestr);
2627 PL_expect = XOPERATOR;
2628 return SUBLEXEND;
2629 }
2630 }
2632 HV *
2633 Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2634 const STRLEN context_len, const char ** error_msg)
2635 {
2636 /* Load the official _charnames module if not already there. The
2637 * parameters are just to give info for any error messages generated:
2638 * char_name a name to look up which is the reason for loading this
2639 * context 'char_name' in the context in the input in which it appears
2640 * context_len how many bytes 'context' occupies
2641 * error_msg *error_msg will be set to any error
2642 *
2643 * Returns the ^H table if success; otherwise NULL */
2645 unsigned int i;
2646 HV * table;
2647 SV **cvp;
2648 SV * res;
2650 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2652 /* This loop is executed 1 1/2 times. On the first time through, if it
2653 * isn't already loaded, try loading it, and iterate just once to see if it
2654 * worked. */
2655 for (i = 0; i < 2; i++) {
2656 table = GvHV(PL_hintgv); /* ^H */
2658 if ( table
2659 && (PL_hints & HINT_LOCALIZE_HH)
2660 && (cvp = hv_fetchs(table, "charnames", FALSE))
2661 && SvOK(*cvp))
2662 {
2663 return table; /* Quit if already loaded */
2664 }
2666 if (i == 0) {
2667 Perl_load_module(aTHX_
2668 0,
2669 newSVpvs("_charnames"),
2671 /* version parameter; no need to specify it, as if we get too early
2672 * a version, will fail anyway, not being able to find 'charnames'
2673 * */
2674 NULL,
2675 newSVpvs(":full"),
2676 newSVpvs(":short"),
2677 NULL);
2678 }
2679 }
2681 /* Here, it failed; new_constant will give appropriate error messages */
2682 *error_msg = NULL;
2683 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2684 context, context_len, error_msg);
2685 SvREFCNT_dec(res);
2687 return NULL;
2688 }
2690 STATIC SV*
2691 S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2692 {
2693 /* This justs wraps get_and_check_backslash_N_name() to output any error
2694 * message it returns. */
2696 const char * error_msg = NULL;
2697 SV * result;
2699 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2701 /* charnames doesn't work well if there have been errors found */
2702 if (PL_error_count > 0) {
2703 return NULL;
2704 }
2706 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2708 if (error_msg) {
2709 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2710 }
2712 return result;
2713 }
2715 SV*
2716 Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2717 const char* const e,
2718 const bool is_utf8,
2719 const char ** error_msg)
2720 {
2721 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2722 * interior, hence to the "}". Finds what the name resolves to, returning
2723 * an SV* containing it; NULL if no valid one found.
2724 *
2725 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2726 * doesn't have to be. */
2728 SV* char_name;
2729 SV* res;
2730 HV * table;
2731 SV **cvp;
2732 SV *cv;
2733 SV *rv;
2734 HV *stash;
2736 /* Points to the beginning of the \N{... so that any messages include the
2737 * context of what's failing*/
2738 const char* context = s - 3;
2739 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2742 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2744 assert(e >= s);
2745 assert(s > (char *) 3);
2747 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
2749 if (!SvCUR(char_name)) {
2750 SvREFCNT_dec_NN(char_name);
2751 /* diag_listed_as: Unknown charname '%s' */
2752 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
2753 return NULL;
2754 }
2756 /* Autoload the charnames module */
2758 table = load_charnames(char_name, context, context_len, error_msg);
2759 if (table == NULL) {
2760 return NULL;
2761 }
2763 *error_msg = NULL;
2764 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2765 context, context_len, error_msg);
2766 if (*error_msg) {
2767 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
2769 SvREFCNT_dec(res);
2770 return NULL;
2771 }
2773 /* See if the charnames handler is the Perl core's, and if so, we can skip
2774 * the validation needed for a user-supplied one, as Perl's does its own
2775 * validation. */
2776 cvp = hv_fetchs(table, "charnames", FALSE);
2777 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2778 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2779 {
2780 const char * const name = HvNAME(stash);
2781 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
2782 return res;
2783 }
2784 }
2786 /* Here, it isn't Perl's charname handler. We can't rely on a
2787 * user-supplied handler to validate the input name. For non-ut8 input,
2788 * look to see that the first character is legal. Then loop through the
2789 * rest checking that each is a continuation */
2791 /* This code makes the reasonable assumption that the only Latin1-range
2792 * characters that begin a character name alias are alphabetic, otherwise
2793 * would have to create a isCHARNAME_BEGIN macro */
2795 if (! is_utf8) {
2796 if (! isALPHAU(*s)) {
2797 goto bad_charname;
2798 }
2799 s++;
2800 while (s < e) {
2801 if (! isCHARNAME_CONT(*s)) {
2802 goto bad_charname;
2803 }
2804 if (*s == ' ' && *(s-1) == ' ') {
2805 goto multi_spaces;
2806 }
2807 s++;
2808 }
2809 }
2810 else {
2811 /* Similarly for utf8. For invariants can check directly; for other
2812 * Latin1, can calculate their code point and check; otherwise use an
2813 * inversion list */
2814 if (UTF8_IS_INVARIANT(*s)) {
2815 if (! isALPHAU(*s)) {
2816 goto bad_charname;
2817 }
2818 s++;
2819 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2820 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2821 goto bad_charname;
2822 }
2823 s += 2;
2824 }
2825 else {
2826 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2827 utf8_to_uvchr_buf((U8 *) s,
2828 (U8 *) e,
2829 NULL)))
2830 {
2831 goto bad_charname;
2832 }
2833 s += UTF8SKIP(s);
2834 }
2836 while (s < e) {
2837 if (UTF8_IS_INVARIANT(*s)) {
2838 if (! isCHARNAME_CONT(*s)) {
2839 goto bad_charname;
2840 }
2841 if (*s == ' ' && *(s-1) == ' ') {
2842 goto multi_spaces;
2843 }
2844 s++;
2845 }
2846 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2847 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2848 {
2849 goto bad_charname;
2850 }
2851 s += 2;
2852 }
2853 else {
2854 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2855 utf8_to_uvchr_buf((U8 *) s,
2856 (U8 *) e,
2857 NULL)))
2858 {
2859 goto bad_charname;
2860 }
2861 s += UTF8SKIP(s);
2862 }
2863 }
2864 }
2865 if (*(s-1) == ' ') {
2866 /* diag_listed_as: charnames alias definitions may not contain
2867 trailing white-space; marked by <-- HERE in %s
2868 */
2869 *error_msg = Perl_form(aTHX_
2870 "charnames alias definitions may not contain trailing "
2871 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2872 (int)(s - context + 1), context,
2873 (int)(e - s + 1), s + 1);
2874 return NULL;
2875 }
2877 if (SvUTF8(res)) { /* Don't accept malformed charname value */
2878 const U8* first_bad_char_loc;
2879 STRLEN len;
2880 const char* const str = SvPV_const(res, len);
2881 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2882 &first_bad_char_loc)))
2883 {
2884 _force_out_malformed_utf8_message(first_bad_char_loc,
2885 (U8 *) PL_parser->bufend,
2886 0,
2887 0 /* 0 means don't die */ );
2888 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2889 immediately after '%s' */
2890 *error_msg = Perl_form(aTHX_
2891 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2892 (int) context_len, context,
2893 (int) ((char *) first_bad_char_loc - str), str);
2894 return NULL;
2895 }
2896 }
2898 return res;
2900 bad_charname: {
2902 /* The final %.*s makes sure that should the trailing NUL be missing
2903 * that this print won't run off the end of the string */
2904 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2905 in \N{%s} */
2906 *error_msg = Perl_form(aTHX_
2907 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2908 (int)(s - context + 1), context,
2909 (int)(e - s + 1), s + 1);
2910 return NULL;
2911 }
2913 multi_spaces:
2914 /* diag_listed_as: charnames alias definitions may not contain a
2915 sequence of multiple spaces; marked by <-- HERE
2916 in %s */
2917 *error_msg = Perl_form(aTHX_
2918 "charnames alias definitions may not contain a sequence of "
2919 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2920 (int)(s - context + 1), context,
2921 (int)(e - s + 1), s + 1);
2922 return NULL;
2923 }
2925 /*
2926 scan_const
2928 Extracts the next constant part of a pattern, double-quoted string,
2929 or transliteration. This is terrifying code.
2931 For example, in parsing the double-quoted string "ab\x63$d", it would
2932 stop at the '$' and return an OP_CONST containing 'abc'.
2934 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2935 processing a pattern (PL_lex_inpat is true), a transliteration
2936 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2938 Returns a pointer to the character scanned up to. If this is
2939 advanced from the start pointer supplied (i.e. if anything was
2940 successfully parsed), will leave an OP_CONST for the substring scanned
2941 in pl_yylval. Caller must intuit reason for not parsing further
2942 by looking at the next characters herself.
2944 In patterns:
2945 expand:
2946 \N{FOO} => \N{U+hex_for_character_FOO}
2947 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2949 pass through:
2950 all other \-char, including \N and \N{ apart from \N{ABC}
2952 stops on:
2953 @ and $ where it appears to be a var, but not for $ as tail anchor
2954 \l \L \u \U \Q \E
2955 (?{ or (??{
2957 In transliterations:
2958 characters are VERY literal, except for - not at the start or end
2959 of the string, which indicates a range. However some backslash sequences
2960 are recognized: \r, \n, and the like
2961 \007 \o{}, \x{}, \N{}
2962 If all elements in the transliteration are below 256,
2963 scan_const expands the range to the full set of intermediate
2964 characters. If the range is in utf8, the hyphen is replaced with
2965 a certain range mark which will be handled by pmtrans() in op.c.
2967 In double-quoted strings:
2968 backslashes:
2969 all those recognized in transliterations
2970 deprecated backrefs: \1 (in substitution replacements)
2971 case and quoting: \U \Q \E
2972 stops on @ and $
2974 scan_const does *not* construct ops to handle interpolated strings.
2975 It stops processing as soon as it finds an embedded $ or @ variable
2976 and leaves it to the caller to work out what's going on.
2978 embedded arrays (whether in pattern or not) could be:
2979 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2981 $ in double-quoted strings must be the symbol of an embedded scalar.
2983 $ in pattern could be $foo or could be tail anchor. Assumption:
2984 it's a tail anchor if $ is the last thing in the string, or if it's
2985 followed by one of "()| \r\n\t"
2987 \1 (backreferences) are turned into $1 in substitutions
2989 The structure of the code is
2990 while (there's a character to process) {
2991 handle transliteration ranges
2992 skip regexp comments /(?#comment)/ and codes /(?{code})/
2993 skip #-initiated comments in //x patterns
2994 check for embedded arrays
2995 check for embedded scalars
2996 if (backslash) {
2997 deprecate \1 in substitution replacements
2998 handle string-changing backslashes \l \U \Q \E, etc.
2999 switch (what was escaped) {
3000 handle \- in a transliteration (becomes a literal -)
3001 if a pattern and not \N{, go treat as regular character
3002 handle \132 (octal characters)
3003 handle \x15 and \x{1234} (hex characters)
3004 handle \N{name} (named characters, also \N{3,5} in a pattern)
3005 handle \cV (control characters)
3006 handle printf-style backslashes (\f, \r, \n, etc)
3007 } (end switch)
3008 continue
3009 } (end if backslash)
3010 handle regular character
3011 } (end while character to read)
3013 */
3015 STATIC char *
3016 S_scan_const(pTHX_ char *start)
3017 {
3018 char *send = PL_bufend; /* end of the constant */
3019 SV *sv = newSV(send - start); /* sv for the constant. See note below
3020 on sizing. */
3021 char *s = start; /* start of the constant */
3022 char *d = SvPVX(sv); /* destination for copies */
3023 bool dorange = FALSE; /* are we in a translit range? */
3024 bool didrange = FALSE; /* did we just finish a range? */
3025 bool in_charclass = FALSE; /* within /[...]/ */
3026 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
3027 UTF8? But, this can show as true
3028 when the source isn't utf8, as for
3029 example when it is entirely composed
3030 of hex constants */
3031 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
3032 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3033 number of characters found so far
3034 that will expand (into 2 bytes)
3035 should we have to convert to
3036 UTF-8) */
3037 SV *res; /* result from charnames */
3038 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3039 high-end character is temporarily placed */
3041 /* Does something require special handling in tr/// ? This avoids extra
3042 * work in a less likely case. As such, khw didn't feel it was worth
3043 * adding any branches to the more mainline code to handle this, which
3044 * means that this doesn't get set in some circumstances when things like
3045 * \x{100} get expanded out. As a result there needs to be extra testing
3046 * done in the tr code */
3047 bool has_above_latin1 = FALSE;
3049 /* Note on sizing: The scanned constant is placed into sv, which is
3050 * initialized by newSV() assuming one byte of output for every byte of
3051 * input. This routine expects newSV() to allocate an extra byte for a
3052 * trailing NUL, which this routine will append if it gets to the end of
3053 * the input. There may be more bytes of input than output (eg., \N{LATIN
3054 * CAPITAL LETTER A}), or more output than input if the constant ends up
3055 * recoded to utf8, but each time a construct is found that might increase
3056 * the needed size, SvGROW() is called. Its size parameter each time is
3057 * based on the best guess estimate at the time, namely the length used so
3058 * far, plus the length the current construct will occupy, plus room for
3059 * the trailing NUL, plus one byte for every input byte still unscanned */
3061 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3062 before set */
3063 #ifdef EBCDIC
3064 int backslash_N = 0; /* ? was the character from \N{} */
3065 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3066 platform-specific like \x65 */
3067 #endif
3069 PERL_ARGS_ASSERT_SCAN_CONST;
3071 assert(PL_lex_inwhat != OP_TRANSR);
3073 /* Protect sv from errors and fatal warnings. */
3074 ENTER_with_name("scan_const");
3075 SAVEFREESV(sv);
3077 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3078 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3079 * valid */
3080 assert(*send == '\0');
3082 while (s < send
3083 || dorange /* Handle tr/// range at right edge of input */
3084 ) {
3086 /* get transliterations out of the way (they're most literal) */
3087 if (PL_lex_inwhat == OP_TRANS) {
3089 /* But there isn't any special handling necessary unless there is a
3090 * range, so for most cases we just drop down and handle the value
3091 * as any other. There are two exceptions.
3092 *
3093 * 1. A hyphen indicates that we are actually going to have a
3094 * range. In this case, skip the '-', set a flag, then drop
3095 * down to handle what should be the end range value.
3096 * 2. After we've handled that value, the next time through, that
3097 * flag is set and we fix up the range.
3098 *
3099 * Ranges entirely within Latin1 are expanded out entirely, in
3100 * order to make the transliteration a simple table look-up.
3101 * Ranges that extend above Latin1 have to be done differently, so
3102 * there is no advantage to expanding them here, so they are
3103 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3104 * a byte that can't occur in legal UTF-8, and hence can signify a
3105 * hyphen without any possible ambiguity. On EBCDIC machines, if
3106 * the range is expressed as Unicode, the Latin1 portion is
3107 * expanded out even if the range extends above Latin1. This is
3108 * because each code point in it has to be processed here
3109 * individually to get its native translation */
3111 if (! dorange) {
3113 /* Here, we don't think we're in a range. If the new character
3114 * is not a hyphen; or if it is a hyphen, but it's too close to
3115 * either edge to indicate a range, or if we haven't output any
3116 * characters yet then it's a regular character. */
3117 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3118 {
3120 /* A regular character. Process like any other, but first
3121 * clear any flags */
3122 didrange = FALSE;
3123 dorange = FALSE;
3124 #ifdef EBCDIC
3125 non_portable_endpoint = 0;
3126 backslash_N = 0;
3127 #endif
3128 /* The tests here for being above Latin1 and similar ones
3129 * in the following 'else' suffice to find all such
3130 * occurences in the constant, except those added by a
3131 * backslash escape sequence, like \x{100}. Mostly, those
3132 * set 'has_above_latin1' as appropriate */
3133 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3134 has_above_latin1 = TRUE;
3135 }
3137 /* Drops down to generic code to process current byte */
3138 }
3139 else { /* Is a '-' in the context where it means a range */
3140 if (didrange) { /* Something like y/A-C-Z// */
3141 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3142 " operator");
3143 }
3145 dorange = TRUE;
3147 s++; /* Skip past the hyphen */
3149 /* d now points to where the end-range character will be
3150 * placed. Drop down to get that character. We'll finish
3151 * processing the range the next time through the loop */
3153 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3154 has_above_latin1 = TRUE;
3155 }
3157 /* Drops down to generic code to process current byte */
3158 }
3159 } /* End of not a range */
3160 else {
3161 /* Here we have parsed a range. Now must handle it. At this
3162 * point:
3163 * 'sv' is a SV* that contains the output string we are
3164 * constructing. The final two characters in that string
3165 * are the range start and range end, in order.
3166 * 'd' points to just beyond the range end in the 'sv' string,
3167 * where we would next place something
3168 */
3169 char * max_ptr;
3170 char * min_ptr;
3171 IV range_min;
3172 IV range_max; /* last character in range */
3173 STRLEN grow;
3174 Size_t offset_to_min = 0;
3175 Size_t extras = 0;
3176 #ifdef EBCDIC
3177 bool convert_unicode;
3178 IV real_range_max = 0;
3179 #endif
3180 /* Get the code point values of the range ends. */
3181 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3182 offset_to_max = max_ptr - SvPVX_const(sv);
3183 if (d_is_utf8) {
3184 /* We know the utf8 is valid, because we just constructed
3185 * it ourselves in previous loop iterations */
3186 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3187 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3188 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3190 /* This compensates for not all code setting
3191 * 'has_above_latin1', so that we don't skip stuff that
3192 * should be executed */
3193 if (range_max > 255) {
3194 has_above_latin1 = TRUE;
3195 }
3196 }
3197 else {
3198 min_ptr = max_ptr - 1;
3199 range_min = * (U8*) min_ptr;
3200 range_max = * (U8*) max_ptr;
3201 }
3203 /* If the range is just a single code point, like tr/a-a/.../,
3204 * that code point is already in the output, twice. We can
3205 * just back up over the second instance and avoid all the rest
3206 * of the work. But if it is a variant character, it's been
3207 * counted twice, so decrement. (This unlikely scenario is
3208 * special cased, like the one for a range of 2 code points
3209 * below, only because the main-line code below needs a range
3210 * of 3 or more to work without special casing. Might as well
3211 * get it out of the way now.) */
3212 if (UNLIKELY(range_max == range_min)) {
3213 d = max_ptr;
3214 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3215 utf8_variant_count--;
3216 }
3217 goto range_done;
3218 }
3220 #ifdef EBCDIC
3221 /* On EBCDIC platforms, we may have to deal with portable
3222 * ranges. These happen if at least one range endpoint is a
3223 * Unicode value (\N{...}), or if the range is a subset of
3224 * [A-Z] or [a-z], and both ends are literal characters,
3225 * like 'A', and not like \x{C1} */
3226 convert_unicode =
3227 cBOOL(backslash_N) /* \N{} forces Unicode,
3228 hence portable range */
3229 || ( ! non_portable_endpoint
3230 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3231 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3232 if (convert_unicode) {
3234 /* Special handling is needed for these portable ranges.
3235 * They are defined to be in Unicode terms, which includes
3236 * all the Unicode code points between the end points.
3237 * Convert to Unicode to get the Unicode range. Later we
3238 * will convert each code point in the range back to
3239 * native. */
3240 range_min = NATIVE_TO_UNI(range_min);
3241 range_max = NATIVE_TO_UNI(range_max);
3242 }
3243 #endif
3245 if (range_min > range_max) {
3246 #ifdef EBCDIC
3247 if (convert_unicode) {
3248 /* Need to convert back to native for meaningful
3249 * messages for this platform */
3250 range_min = UNI_TO_NATIVE(range_min);
3251 range_max = UNI_TO_NATIVE(range_max);
3252 }
3253 #endif
3254 /* Use the characters themselves for the error message if
3255 * ASCII printables; otherwise some visible representation
3256 * of them */
3257 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3258 Perl_croak(aTHX_
3259 "Invalid range \"%c-%c\" in transliteration operator",
3260 (char)range_min, (char)range_max);
3261 }
3262 #ifdef EBCDIC
3263 else if (convert_unicode) {
3264 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3265 Perl_croak(aTHX_
3266 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3267 UVXf "}\" in transliteration operator",
3268 range_min, range_max);
3269 }
3270 #endif
3271 else {
3272 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3273 Perl_croak(aTHX_
3274 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3275 " in transliteration operator",
3276 range_min, range_max);
3277 }
3278 }
3280 /* If the range is exactly two code points long, they are
3281 * already both in the output */
3282 if (UNLIKELY(range_min + 1 == range_max)) {
3283 goto range_done;
3284 }
3286 /* Here the range contains at least 3 code points */
3288 if (d_is_utf8) {
3290 /* If everything in the transliteration is below 256, we
3291 * can avoid special handling later. A translation table
3292 * for each of those bytes is created by op.c. So we
3293 * expand out all ranges to their constituent code points.
3294 * But if we've encountered something above 255, the
3295 * expanding won't help, so skip doing that. But if it's
3296 * EBCDIC, we may have to look at each character below 256
3297 * if we have to convert to/from Unicode values */
3298 if ( has_above_latin1
3299 #ifdef EBCDIC
3300 && (range_min > 255 || ! convert_unicode)
3301 #endif
3302 ) {
3303 const STRLEN off = d - SvPVX(sv);
3304 const STRLEN extra = 1 + (send - s) + 1;
3305 char *e;
3307 /* Move the high character one byte to the right; then
3308 * insert between it and the range begin, an illegal
3309 * byte which serves to indicate this is a range (using
3310 * a '-' would be ambiguous). */
3312 if (off + extra > SvLEN(sv)) {
3313 d = off + SvGROW(sv, off + extra);
3314 max_ptr = d - off + offset_to_max;
3315 }
3317 e = d++;
3318 while (e-- > max_ptr) {
3319 *(e + 1) = *e;
3320 }
3321 *(e + 1) = (char) RANGE_INDICATOR;
3322 goto range_done;
3323 }
3325 /* Here, we're going to expand out the range. For EBCDIC
3326 * the range can extend above 255 (not so in ASCII), so
3327 * for EBCDIC, split it into the parts above and below
3328 * 255/256 */
3329 #ifdef EBCDIC
3330 if (range_max > 255) {
3331 real_range_max = range_max;
3332 range_max = 255;
3333 }
3334 #endif
3335 }
3337 /* Here we need to expand out the string to contain each
3338 * character in the range. Grow the output to handle this.
3339 * For non-UTF8, we need a byte for each code point in the
3340 * range, minus the three that we've already allocated for: the
3341 * hyphen, the min, and the max. For UTF-8, we need this
3342 * plus an extra byte for each code point that occupies two
3343 * bytes (is variant) when in UTF-8 (except we've already
3344 * allocated for the end points, including if they are
3345 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3346 * platforms, it's easy to calculate a precise number. To
3347 * start, we count the variants in the range, which we need
3348 * elsewhere in this function anyway. (For the case where it
3349 * isn't easy to calculate, 'extras' has been initialized to 0,
3350 * and the calculation is done in a loop further down.) */
3351 #ifdef EBCDIC
3352 if (convert_unicode)
3353 #endif
3354 {
3355 /* This is executed unconditionally on ASCII, and for
3356 * Unicode ranges on EBCDIC. Under these conditions, all
3357 * code points above a certain value are variant; and none
3358 * under that value are. We just need to find out how much
3359 * of the range is above that value. We don't count the
3360 * end points here, as they will already have been counted
3361 * as they were parsed. */
3362 if (range_min >= UTF_CONTINUATION_MARK) {
3364 /* The whole range is made up of variants */
3365 extras = (range_max - 1) - (range_min + 1) + 1;
3366 }
3367 else if (range_max >= UTF_CONTINUATION_MARK) {
3369 /* Only the higher portion of the range is variants */
3370 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3371 }
3373 utf8_variant_count += extras;
3374 }
3376 /* The base growth is the number of code points in the range,
3377 * not including the endpoints, which have already been sized
3378 * for (and output). We don't subtract for the hyphen, as it
3379 * has been parsed but not output, and the SvGROW below is
3380 * based only on what's been output plus what's left to parse.
3381 * */
3382 grow = (range_max - 1) - (range_min + 1) + 1;
3384 if (d_is_utf8) {
3385 #ifdef EBCDIC
3386 /* In some cases in EBCDIC, we haven't yet calculated a
3387 * precise amount needed for the UTF-8 variants. Just
3388 * assume the worst case, that everything will expand by a
3389 * byte */
3390 if (! convert_unicode) {
3391 grow *= 2;
3392 }
3393 else
3394 #endif
3395 {
3396 /* Otherwise we know exactly how many variants there
3397 * are in the range. */
3398 grow += extras;
3399 }
3400 }
3402 /* Grow, but position the output to overwrite the range min end
3403 * point, because in some cases we overwrite that */
3404 SvCUR_set(sv, d - SvPVX_const(sv));
3405 offset_to_min = min_ptr - SvPVX_const(sv);
3407 /* See Note on sizing above. */
3408 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3409 + (send - s)
3410 + grow
3411 + 1 /* Trailing NUL */ );
3413 /* Now, we can expand out the range. */
3414 #ifdef EBCDIC
3415 if (convert_unicode) {
3416 SSize_t i;
3418 /* Recall that the min and max are now in Unicode terms, so
3419 * we have to convert each character to its native
3420 * equivalent */
3421 if (d_is_utf8) {
3422 for (i = range_min; i <= range_max; i++) {
3423 append_utf8_from_native_byte(
3424 LATIN1_TO_NATIVE((U8) i),
3425 (U8 **) &d);
3426 }
3427 }
3428 else {
3429 for (i = range_min; i <= range_max; i++) {
3430 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3431 }
3432 }
3433 }
3434 else
3435 #endif
3436 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3437 {
3438 /* Here, no conversions are necessary, which means that the
3439 * first character in the range is already in 'd' and
3440 * valid, so we can skip overwriting it */
3441 if (d_is_utf8) {
3442 SSize_t i;
3443 d += UTF8SKIP(d);
3444 for (i = range_min + 1; i <= range_max; i++) {
3445 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3446 }
3447 }
3448 else {
3449 SSize_t i;
3450 d++;
3451 assert(range_min + 1 <= range_max);
3452 for (i = range_min + 1; i < range_max; i++) {
3453 #ifdef EBCDIC
3454 /* In this case on EBCDIC, we haven't calculated
3455 * the variants. Do it here, as we go along */
3456 if (! UVCHR_IS_INVARIANT(i)) {
3457 utf8_variant_count++;
3458 }
3459 #endif
3460 *d++ = (char)i;
3461 }
3463 /* The range_max is done outside the loop so as to
3464 * avoid having to special case not incrementing
3465 * 'utf8_variant_count' on EBCDIC (it's already been
3466 * counted when originally parsed) */
3467 *d++ = (char) range_max;
3468 }
3469 }
3471 #ifdef EBCDIC
3472 /* If the original range extended above 255, add in that
3473 * portion. */
3474 if (real_range_max) {
3475 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3476 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3477 if (real_range_max > 0x100) {
3478 if (real_range_max > 0x101) {
3479 *d++ = (char) RANGE_INDICATOR;
3480 }
3481 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3482 }
3483 }
3484 #endif
3486 range_done:
3487 /* mark the range as done, and continue */
3488 didrange = TRUE;
3489 dorange = FALSE;
3490 #ifdef EBCDIC
3491 non_portable_endpoint = 0;
3492 backslash_N = 0;
3493 #endif
3494 continue;
3495 } /* End of is a range */
3496 } /* End of transliteration. Joins main code after these else's */
3497 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3498 char *s1 = s-1;
3499 int esc = 0;
3500 while (s1 >= start && *s1-- == '\\')
3501 esc = !esc;
3502 if (!esc)
3503 in_charclass = TRUE;
3504 }
3505 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3506 char *s1 = s-1;
3507 int esc = 0;
3508 while (s1 >= start && *s1-- == '\\')
3509 esc = !esc;
3510 if (!esc)
3511 in_charclass = FALSE;
3512 }
3513 /* skip for regexp comments /(?#comment)/, except for the last
3514 * char, which will be done separately. Stop on (?{..}) and
3515 * friends */
3516 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3517 if (s[2] == '#') {
3518 if (s_is_utf8) {
3519 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3521 while (s + len < send && *s != ')') {
3522 Copy(s, d, len, U8);
3523 d += len;
3524 s += len;
3525 len = UTF8_SAFE_SKIP(s, send);
3526 }
3527 }
3528 else while (s+1 < send && *s != ')') {
3529 *d++ = *s++;
3530 }
3531 }
3532 else if (!PL_lex_casemods
3533 && ( s[2] == '{' /* This should match regcomp.c */
3534 || (s[2] == '?' && s[3] == '{')))
3535 {
3536 break;
3537 }
3538 }
3539 /* likewise skip #-initiated comments in //x patterns */
3540 else if (*s == '#'
3541 && PL_lex_inpat
3542 && !in_charclass
3543 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3544 {
3545 while (s < send && *s != '\n')
3546 *d++ = *s++;
3547 }
3548 /* no further processing of single-quoted regex */
3549 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3550 goto default_action;
3552 /* check for embedded arrays
3553 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3554 */
3555 else if (*s == '@' && s[1]) {
3556 if (UTF
3557 ? isIDFIRST_utf8_safe(s+1, send)
3558 : isWORDCHAR_A(s[1]))
3559 {
3560 break;
3561 }
3562 if (memCHRs(":'{$", s[1]))
3563 break;
3564 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3565 break; /* in regexp, neither @+ nor @- are interpolated */
3566 }
3567 /* check for embedded scalars. only stop if we're sure it's a
3568 * variable. */
3569 else if (*s == '$') {
3570 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3571 break;
3572 if (s + 1 < send && !memCHRs("()| \r\n\t", s[1])) {
3573 if (s[1] == '\\') {
3574 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3575 "Possible unintended interpolation of $\\ in regex");
3576 }
3577 break; /* in regexp, $ might be tail anchor */
3578 }
3579 }
3581 /* End of else if chain - OP_TRANS rejoin rest */
3583 if (UNLIKELY(s >= send)) {
3584 assert(s == send);
3585 break;
3586 }
3588 /* backslashes */
3589 if (*s == '\\' && s+1 < send) {
3590 char* e; /* Can be used for ending '}', etc. */
3592 s++;
3594 /* warn on \1 - \9 in substitution replacements, but note that \11
3595 * is an octal; and \19 is \1 followed by '9' */
3596 if (PL_lex_inwhat == OP_SUBST
3597 && !PL_lex_inpat
3598 && isDIGIT(*s)
3599 && *s != '0'
3600 && !isDIGIT(s[1]))
3601 {
3602 /* diag_listed_as: \%d better written as $%d */
3603 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3604 *--s = '$';
3605 break;
3606 }
3608 /* string-change backslash escapes */
3609 if (PL_lex_inwhat != OP_TRANS && *s && memCHRs("lLuUEQF", *s)) {
3610 --s;
3611 break;
3612 }
3613 /* In a pattern, process \N, but skip any other backslash escapes.
3614 * This is because we don't want to translate an escape sequence
3615 * into a meta symbol and have the regex compiler use the meta
3616 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3617 * in spite of this, we do have to process \N here while the proper
3618 * charnames handler is in scope. See bugs #56444 and #62056.
3619 *
3620 * There is a complication because \N in a pattern may also stand
3621 * for 'match a non-nl', and not mean a charname, in which case its
3622 * processing should be deferred to the regex compiler. To be a
3623 * charname it must be followed immediately by a '{', and not look
3624 * like \N followed by a curly quantifier, i.e., not something like
3625 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3626 * quantifier */
3627 else if (PL_lex_inpat
3628 && (*s != 'N'
3629 || s[1] != '{'
3630 || regcurly(s + 1)))
3631 {
3632 *d++ = '\\';
3633 goto default_action;
3634 }
3636 switch (*s) {
3637 default:
3638 {
3639 if ((isALPHANUMERIC(*s)))
3640 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3641 "Unrecognized escape \\%c passed through",
3642 *s);
3643 /* default action is to copy the quoted character */
3644 goto default_action;
3645 }
3647 /* eg. \132 indicates the octal constant 0132 */
3648 case '0': case '1': case '2': case '3':
3649 case '4': case '5': case '6': case '7':
3650 {
3651 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3652 | PERL_SCAN_NOTIFY_ILLDIGIT;
3653 STRLEN len = 3;
3654 uv = grok_oct(s, &len, &flags, NULL);
3655 s += len;
3656 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
3657 && s < send
3658 && isDIGIT(*s) /* like \08, \178 */
3659 && ckWARN(WARN_MISC))
3660 {
3661 Perl_warner(aTHX_ packWARN(WARN_MISC), "%s",
3662 form_alien_digit_msg(8, len, s, send, UTF, FALSE));
3663 }
3664 }
3665 goto NUM_ESCAPE_INSERT;
3667 /* eg. \o{24} indicates the octal constant \024 */
3668 case 'o':
3669 {
3670 const char* error;
3672 if (! grok_bslash_o(&s, send,
3673 &uv, &error,
3674 NULL,
3675 FALSE, /* Not strict */
3676 FALSE, /* No illegal cp's */
3677 UTF))
3678 {
3679 yyerror(error);
3680 uv = 0; /* drop through to ensure range ends are set */
3681 }
3682 goto NUM_ESCAPE_INSERT;
3683 }
3685 /* eg. \x24 indicates the hex constant 0x24 */
3686 case 'x':
3687 {
3688 const char* error;
3690 if (! grok_bslash_x(&s, send,
3691 &uv, &error,
3692 NULL,
3693 FALSE, /* Not strict */
3694 FALSE, /* No illegal cp's */
3695 UTF))
3696 {
3697 yyerror(error);
3698 uv = 0; /* drop through to ensure range ends are set */
3699 }
3700 }
3702 NUM_ESCAPE_INSERT:
3703 /* Insert oct or hex escaped character. */
3705 /* Here uv is the ordinal of the next character being added */
3706 if (UVCHR_IS_INVARIANT(uv)) {
3707 *d++ = (char) uv;
3708 }
3709 else {
3710 if (!d_is_utf8 && uv > 255) {
3712 /* Here, 'uv' won't fit unless we convert to UTF-8.
3713 * If we've only seen invariants so far, all we have to
3714 * do is turn on the flag */
3715 if (utf8_variant_count == 0) {
3716 SvUTF8_on(sv);
3717 }
3718 else {
3719 SvCUR_set(sv, d - SvPVX_const(sv));
3720 SvPOK_on(sv);
3721 *d = '\0';
3723 sv_utf8_upgrade_flags_grow(
3724 sv,
3725 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3727 /* Since we're having to grow here,
3728 * make sure we have enough room for
3729 * this escape and a NUL, so the
3730 * code immediately below won't have
3731 * to actually grow again */
3732 UVCHR_SKIP(uv)
3733 + (STRLEN)(send - s) + 1);
3734 d = SvPVX(sv) + SvCUR(sv);
3735 }
3737 has_above_latin1 = TRUE;
3738 d_is_utf8 = TRUE;
3739 }
3741 if (! d_is_utf8) {
3742 *d++ = (char)uv;
3743 utf8_variant_count++;
3744 }
3745 else {
3746 /* Usually, there will already be enough room in 'sv'
3747 * since such escapes are likely longer than any UTF-8
3748 * sequence they can end up as. This isn't the case on
3749 * EBCDIC where \x{40000000} contains 12 bytes, and the
3750 * UTF-8 for it contains 14. And, we have to allow for
3751 * a trailing NUL. It probably can't happen on ASCII
3752 * platforms, but be safe. See Note on sizing above. */
3753 const STRLEN needed = d - SvPVX(sv)
3754 + UVCHR_SKIP(uv)
3755 + (send - s)
3756 + 1;
3757 if (UNLIKELY(needed > SvLEN(sv))) {
3758 SvCUR_set(sv, d - SvPVX_const(sv));
3759 d = SvCUR(sv) + SvGROW(sv, needed);
3760 }
3762 d = (char*) uvchr_to_utf8_flags((U8*)d, uv,
3763 (ckWARN(WARN_PORTABLE))
3764 ? UNICODE_WARN_PERL_EXTENDED
3765 : 0);
3766 }
3767 }
3768 #ifdef EBCDIC
3769 non_portable_endpoint++;
3770 #endif
3771 continue;
3773 case 'N':
3774 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3775 * named character, like \N{LATIN SMALL LETTER A}, or a named
3776 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3777 * GRAVE} (except y/// can't handle the latter, croaking). For
3778 * convenience all three forms are referred to as "named
3779 * characters" below.
3780 *
3781 * For patterns, \N also can mean to match a non-newline. Code
3782 * before this 'switch' statement should already have handled
3783 * this situation, and hence this code only has to deal with
3784 * the named character cases.
3785 *
3786 * For non-patterns, the named characters are converted to
3787 * their string equivalents. In patterns, named characters are
3788 * not converted to their ultimate forms for the same reasons
3789 * that other escapes aren't (mainly that the ultimate
3790 * character could be considered a meta-symbol by the regex
3791 * compiler). Instead, they are converted to the \N{U+...}
3792 * form to get the value from the charnames that is in effect
3793 * right now, while preserving the fact that it was a named
3794 * character, so that the regex compiler knows this.
3795 *
3796 * The structure of this section of code (besides checking for
3797 * errors and upgrading to utf8) is:
3798 * If the named character is of the form \N{U+...}, pass it
3799 * through if a pattern; otherwise convert the code point
3800 * to utf8
3801 * Otherwise must be some \N{NAME}: convert to
3802 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3803 *
3804 * Transliteration is an exception. The conversion to utf8 is
3805 * only done if the code point requires it to be representable.
3806 *
3807 * Here, 's' points to the 'N'; the test below is guaranteed to
3808 * succeed if we are being called on a pattern, as we already
3809 * know from a test above that the next character is a '{'. A
3810 * non-pattern \N must mean 'named character', which requires
3811 * braces */
3812 s++;
3813 if (*s != '{') {
3814 yyerror("Missing braces on \\N{}");
3815 *d++ = '\0';
3816 continue;
3817 }
3818 s++;
3820 /* If there is no matching '}', it is an error. */
3821 if (! (e = (char *) memchr(s, '}', send - s))) {
3822 if (! PL_lex_inpat) {
3823 yyerror("Missing right brace on \\N{}");
3824 } else {
3825 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3826 }
3827 yyquit(); /* Have exhausted the input. */
3828 }
3830 /* Here it looks like a named character */
3832 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3833 s += 2; /* Skip to next char after the 'U+' */
3834 if (PL_lex_inpat) {
3836 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3837 /* Check the syntax. */
3838 const char *orig_s;
3839 orig_s = s - 5;
3840 if (!isXDIGIT(*s)) {
3841 bad_NU:
3842 yyerror(
3843 "Invalid hexadecimal number in \\N{U+...}"
3844 );
3845 s = e + 1;
3846 *d++ = '\0';
3847 continue;
3848 }
3849 while (++s < e) {
3850 if (isXDIGIT(*s))
3851 continue;
3852 else if ((*s == '.' || *s == '_')
3853 && isXDIGIT(s[1]))
3854 continue;
3855 goto bad_NU;
3856 }
3858 /* Pass everything through unchanged.
3859 * +1 is for the '}' */
3860 Copy(orig_s, d, e - orig_s + 1, char);
3861 d += e - orig_s + 1;
3862 }
3863 else { /* Not a pattern: convert the hex to string */
3864 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3865 | PERL_SCAN_SILENT_ILLDIGIT
3866 | PERL_SCAN_SILENT_OVERFLOW
3867 | PERL_SCAN_DISALLOW_PREFIX;
3868 STRLEN len = e - s;
3870 uv = grok_hex(s, &len, &flags, NULL);
3871 if (len == 0 || (len != (STRLEN)(e - s)))
3872 goto bad_NU;
3874 if ( uv > MAX_LEGAL_CP
3875 || (flags & PERL_SCAN_GREATER_THAN_UV_MAX))
3876 {
3877 yyerror(form_cp_too_large_msg(16, s, len, 0));
3878 uv = 0; /* drop through to ensure range ends are
3879 set */
3880 }
3882 /* For non-tr///, if the destination is not in utf8,
3883 * unconditionally recode it to be so. This is
3884 * because \N{} implies Unicode semantics, and scalars
3885 * have to be in utf8 to guarantee those semantics.
3886 * tr/// doesn't care about Unicode rules, so no need
3887 * there to upgrade to UTF-8 for small enough code
3888 * points */
3889 if (! d_is_utf8 && ( uv > 0xFF
3890 || PL_lex_inwhat != OP_TRANS))
3891 {
3892 /* See Note on sizing above. */
3893 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3895 SvCUR_set(sv, d - SvPVX_const(sv));
3896 SvPOK_on(sv);
3897 *d = '\0';
3899 if (utf8_variant_count == 0) {
3900 SvUTF8_on(sv);
3901 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3902 }
3903 else {
3904 sv_utf8_upgrade_flags_grow(
3905 sv,
3906 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3907 extra);
3908 d = SvPVX(sv) + SvCUR(sv);
3909 }
3911 d_is_utf8 = TRUE;
3912 has_above_latin1 = TRUE;
3913 }
3915 /* Add the (Unicode) code point to the output. */
3916 if (! d_is_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3917 *d++ = (char) LATIN1_TO_NATIVE(uv);
3918 }
3919 else {
3920 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv,
3921 (ckWARN(WARN_PORTABLE))
3922 ? UNICODE_WARN_PERL_EXTENDED
3923 : 0);
3924 }
3925 }
3926 }
3927 else /* Here is \N{NAME} but not \N{U+...}. */
3928 if (! (res = get_and_check_backslash_N_name_wrapper(s, e)))
3929 { /* Failed. We should die eventually, but for now use a NUL
3930 to keep parsing */
3931 *d++ = '\0';
3932 }
3933 else { /* Successfully evaluated the name */
3934 STRLEN len;
3935 const char *str = SvPV_const(res, len);
3936 if (PL_lex_inpat) {
3938 if (! len) { /* The name resolved to an empty string */
3939 const char empty_N[] = "\\N{_}";
3940 Copy(empty_N, d, sizeof(empty_N) - 1, char);
3941 d += sizeof(empty_N) - 1;
3942 }
3943 else {
3944 /* In order to not lose information for the regex
3945 * compiler, pass the result in the specially made
3946 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3947 * the code points in hex of each character
3948 * returned by charnames */
3950 const char *str_end = str + len;
3951 const STRLEN off = d - SvPVX_const(sv);
3953 if (! SvUTF8(res)) {
3954 /* For the non-UTF-8 case, we can determine the
3955 * exact length needed without having to parse
3956 * through the string. Each character takes up
3957 * 2 hex digits plus either a trailing dot or
3958 * the "}" */
3959 const char initial_text[] = "\\N{U+";
3960 const STRLEN initial_len = sizeof(initial_text)
3961 - 1;
3962 d = off + SvGROW(sv, off
3963 + 3 * len
3965 /* +1 for trailing NUL */
3966 + initial_len + 1
3968 + (STRLEN)(send - e));
3969 Copy(initial_text, d, initial_len, char);
3970 d += initial_len;
3971 while (str < str_end) {
3972 char hex_string[4];
3973 int len =
3974 my_snprintf(hex_string,
3975 sizeof(hex_string),
3976 "%02X.",
3978 /* The regex compiler is
3979 * expecting Unicode, not
3980 * native */
3981 NATIVE_TO_LATIN1(*str));
3982 PERL_MY_SNPRINTF_POST_GUARD(len,
3983 sizeof(hex_string));
3984 Copy(hex_string, d, 3, char);
3985 d += 3;
3986 str++;
3987 }
3988 d--; /* Below, we will overwrite the final
3989 dot with a right brace */
3990 }
3991 else {
3992 STRLEN char_length; /* cur char's byte length */
3994 /* and the number of bytes after this is
3995 * translated into hex digits */
3996 STRLEN output_length;
3998 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3999 * for max('U+', '.'); and 1 for NUL */
4000 char hex_string[2 * UTF8_MAXBYTES + 5];
4002 /* Get the first character of the result. */
4003 U32 uv = utf8n_to_uvchr((U8 *) str,
4004 len,
4005 &char_length,
4006 UTF8_ALLOW_ANYUV);
4007 /* Convert first code point to Unicode hex,
4008 * including the boiler plate before it. */
4009 output_length =
4010 my_snprintf(hex_string, sizeof(hex_string),
4011 "\\N{U+%X",
4012 (unsigned int) NATIVE_TO_UNI(uv));
4014 /* Make sure there is enough space to hold it */
4015 d = off + SvGROW(sv, off
4016 + output_length
4017 + (STRLEN)(send - e)
4018 + 2); /* '}' + NUL */
4019 /* And output it */
4020 Copy(hex_string, d, output_length, char);
4021 d += output_length;
4023 /* For each subsequent character, append dot and
4024 * its Unicode code point in hex */
4025 while ((str += char_length) < str_end) {
4026 const STRLEN off = d - SvPVX_const(sv);
4027 U32 uv = utf8n_to_uvchr((U8 *) str,
4028 str_end - str,
4029 &char_length,
4030 UTF8_ALLOW_ANYUV);
4031 output_length =
4032 my_snprintf(hex_string,
4033 sizeof(hex_string),
4034 ".%X",
4035 (unsigned int) NATIVE_TO_UNI(uv));
4037 d = off + SvGROW(sv, off
4038 + output_length
4039 + (STRLEN)(send - e)
4040 + 2); /* '}' + NUL */
4041 Copy(hex_string, d, output_length, char);
4042 d += output_length;
4043 }
4044 }
4046 *d++ = '}'; /* Done. Add the trailing brace */
4047 }
4048 }
4049 else { /* Here, not in a pattern. Convert the name to a
4050 * string. */
4052 if (PL_lex_inwhat == OP_TRANS) {
4053 str = SvPV_const(res, len);
4054 if (len > ((SvUTF8(res))
4055 ? UTF8SKIP(str)
4056 : 1U))
4057 {
4058 yyerror(Perl_form(aTHX_
4059 "%.*s must not be a named sequence"
4060 " in transliteration operator",
4061 /* +1 to include the "}" */
4062 (int) (e + 1 - start), start));
4063 *d++ = '\0';
4064 goto end_backslash_N;
4065 }
4067 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
4068 has_above_latin1 = TRUE;
4069 }
4071 }
4072 else if (! SvUTF8(res)) {
4073 /* Make sure \N{} return is UTF-8. This is because
4074 * \N{} implies Unicode semantics, and scalars have
4075 * to be in utf8 to guarantee those semantics; but
4076 * not needed in tr/// */
4077 sv_utf8_upgrade_flags(res, 0);
4078 str = SvPV_const(res, len);
4079 }
4081 /* Upgrade destination to be utf8 if this new
4082 * component is */
4083 if (! d_is_utf8 && SvUTF8(res)) {
4084 /* See Note on sizing above. */
4085 const STRLEN extra = len + (send - s) + 1;
4087 SvCUR_set(sv, d - SvPVX_const(sv));
4088 SvPOK_on(sv);
4089 *d = '\0';
4091 if (utf8_variant_count == 0) {
4092 SvUTF8_on(sv);
4093 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
4094 }
4095 else {
4096 sv_utf8_upgrade_flags_grow(sv,
4097 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4098 extra);
4099 d = SvPVX(sv) + SvCUR(sv);
4100 }
4101 d_is_utf8 = TRUE;
4102 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
4104 /* See Note on sizing above. (NOTE: SvCUR() is not
4105 * set correctly here). */
4106 const STRLEN extra = len + (send - e) + 1;
4107 const STRLEN off = d - SvPVX_const(sv);
4108 d = off + SvGROW(sv, off + extra);
4109 }
4110 Copy(str, d, len, char);
4111 d += len;
4112 }
4114 SvREFCNT_dec(res);
4116 } /* End \N{NAME} */
4118 end_backslash_N:
4119 #ifdef EBCDIC
4120 backslash_N++; /* \N{} is defined to be Unicode */
4121 #endif
4122 s = e + 1; /* Point to just after the '}' */
4123 continue;
4125 /* \c is a control character */
4126 case 'c':
4127 s++;
4128 if (s < send) {
4129 const char * message;
4131 if (! grok_bslash_c(*s, (U8 *) d, &message, NULL)) {
4132 yyerror(message);
4133 yyquit(); /* Have always immediately croaked on
4134 errors in this */
4135 }
4136 d++;
4137 }
4138 else {
4139 yyerror("Missing control char name in \\c");
4140 yyquit(); /* Are at end of input, no sense continuing */
4141 }
4142 #ifdef EBCDIC
4143 non_portable_endpoint++;
4144 #endif
4145 break;
4147 /* printf-style backslashes, formfeeds, newlines, etc */
4148 case 'b':
4149 *d++ = '\b';
4150 break;
4151 case 'n':
4152 *d++ = '\n';
4153 break;
4154 case 'r':
4155 *d++ = '\r';
4156 break;
4157 case 'f':
4158 *d++ = '\f';
4159 break;
4160 case 't':
4161 *d++ = '\t';
4162 break;
4163 case 'e':
4164 *d++ = ESC_NATIVE;
4165 break;
4166 case 'a':
4167 *d++ = '\a';
4168 break;
4169 } /* end switch */
4171 s++;
4172 continue;
4173 } /* end if (backslash) */
4175 default_action:
4176 /* Just copy the input to the output, though we may have to convert
4177 * to/from UTF-8.
4178 *
4179 * If the input has the same representation in UTF-8 as not, it will be
4180 * a single byte, and we don't care about UTF8ness; just copy the byte */
4181 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
4182 *d++ = *s++;
4183 }
4184 else if (! s_is_utf8 && ! d_is_utf8) {
4185 /* If neither source nor output is UTF-8, is also a single byte,
4186 * just copy it; but this byte counts should we later have to
4187 * convert to UTF-8 */
4188 *d++ = *s++;
4189 utf8_variant_count++;
4190 }
4191 else if (s_is_utf8 && d_is_utf8) { /* Both UTF-8, can just copy */
4192 const STRLEN len = UTF8SKIP(s);
4194 /* We expect the source to have already been checked for
4195 * malformedness */
4196 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
4198 Copy(s, d, len, U8);
4199 d += len;
4200 s += len;
4201 }
4202 else if (s_is_utf8) { /* UTF8ness matters: convert output to utf8 */
4203 STRLEN need = send - s + 1; /* See Note on sizing above. */
4205 SvCUR_set(sv, d - SvPVX_const(sv));
4206 SvPOK_on(sv);
4207 *d = '\0';
4209 if (utf8_variant_count == 0) {
4210 SvUTF8_on(sv);
4211 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
4212 }
4213 else {
4214 sv_utf8_upgrade_flags_grow(sv,
4215 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4216 need);
4217 d = SvPVX(sv) + SvCUR(sv);
4218 }
4219 d_is_utf8 = TRUE;
4220 goto default_action; /* Redo, having upgraded so both are UTF-8 */
4221 }
4222 else { /* UTF8ness matters: convert this non-UTF8 source char to
4223 UTF-8 for output. It will occupy 2 bytes, but don't include
4224 the input byte since we haven't incremented 's' yet. See
4225 Note on sizing above. */
4226 const STRLEN off = d - SvPVX(sv);
4227 const STRLEN extra = 2 + (send - s - 1) + 1;
4228 if (off + extra > SvLEN(sv)) {
4229 d = off + SvGROW(sv, off + extra);
4230 }
4231 *d++ = UTF8_EIGHT_BIT_HI(*s);
4232 *d++ = UTF8_EIGHT_BIT_LO(*s);
4233 s++;
4234 }
4235 } /* while loop to process each character */
4237 {
4238 const STRLEN off = d - SvPVX(sv);
4240 /* See if room for the terminating NUL */
4241 if (UNLIKELY(off >= SvLEN(sv))) {
4243 #ifndef DEBUGGING
4245 if (off > SvLEN(sv))
4246 #endif
4247 Perl_croak(aTHX_ "panic: constant overflowed allocated space,"
4248 " %" UVuf " >= %" UVuf, (UV)off, (UV)SvLEN(sv));
4250 /* Whew! Here we don't have room for the terminating NUL, but
4251 * everything else so far has fit. It's not too late to grow
4252 * to fit the NUL and continue on. But it is a bug, as the code
4253 * above was supposed to have made room for this, so under
4254 * DEBUGGING builds, we panic anyway. */
4255 d = off + SvGROW(sv, off + 1);
4256 }
4257 }
4259 /* terminate the string and set up the sv */
4260 *d = '\0';
4261 SvCUR_set(sv, d - SvPVX_const(sv));
4263 SvPOK_on(sv);
4264 if (d_is_utf8) {
4265 SvUTF8_on(sv);
4266 }
4268 /* shrink the sv if we allocated more than we used */
4269 if (SvCUR(sv) + 5 < SvLEN(sv)) {
4270 SvPV_shrink_to_cur(sv);
4271 }
4273 /* return the substring (via pl_yylval) only if we parsed anything */
4274 if (s > start) {
4275 char *s2 = start;
4276 for (; s2 < s; s2++) {
4277 if (*s2 == '\n')
4278 COPLINE_INC_WITH_HERELINES;
4279 }
4280 SvREFCNT_inc_simple_void_NN(sv);
4281 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
4282 && ! PL_parser->lex_re_reparsing)
4283 {
4284 const char *const key = PL_lex_inpat ? "qr" : "q";
4285 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
4286 const char *type;
4287 STRLEN typelen;
4289 if (PL_lex_inwhat == OP_TRANS) {
4290 type = "tr";
4291 typelen = 2;
4292 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
4293 type = "s";
4294 typelen = 1;
4295 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
4296 type = "q";
4297 typelen = 1;
4298 } else {
4299 type = "qq";
4300 typelen = 2;
4301 }
4303 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
4304 type, typelen, NULL);
4305 }
4306 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
4307 }
4308 LEAVE_with_name("scan_const");
4309 return s;
4310 }
4312 /* S_intuit_more
4313 * Returns TRUE if there's more to the expression (e.g., a subscript),
4314 * FALSE otherwise.
4315 *
4316 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4317 *
4318 * ->[ and ->{ return TRUE
4319 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4320 * { and [ outside a pattern are always subscripts, so return TRUE
4321 * if we're outside a pattern and it's not { or [, then return FALSE
4322 * if we're in a pattern and the first char is a {
4323 * {4,5} (any digits around the comma) returns FALSE
4324 * if we're in a pattern and the first char is a [
4325 * [] returns FALSE
4326 * [SOMETHING] has a funky algorithm to decide whether it's a
4327 * character class or not. It has to deal with things like
4328 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4329 * anything else returns TRUE
4330 */
4332 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4334 STATIC int
4335 S_intuit_more(pTHX_ char *s, char *e)
4336 {
4337 PERL_ARGS_ASSERT_INTUIT_MORE;
4339 if (PL_lex_brackets)
4340 return TRUE;
4341 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4342 return TRUE;
4343 if (*s == '-' && s[1] == '>'
4344 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4345 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4346 ||(s[2] == '@' && memCHRs("*[{",s[3])) ))
4347 return TRUE;
4348 if (*s != '{' && *s != '[')
4349 return FALSE;
4350 PL_parser->sub_no_recover = TRUE;
4351 if (!PL_lex_inpat)
4352 return TRUE;
4354 /* In a pattern, so maybe we have {n,m}. */
4355 if (*s == '{') {
4356 if (regcurly(s)) {
4357 return FALSE;
4358 }
4359 return TRUE;
4360 }
4362 /* On the other hand, maybe we have a character class */
4364 s++;
4365 if (*s == ']' || *s == '^')
4366 return FALSE;
4367 else {
4368 /* this is terrifying, and it works */
4369 int weight;
4370 char seen[256];
4371 const char * const send = (char *) memchr(s, ']', e - s);
4372 unsigned char un_char, last_un_char;
4373 char tmpbuf[sizeof PL_tokenbuf * 4];
4375 if (!send) /* has to be an expression */
4376 return TRUE;
4377 weight = 2; /* let's weigh the evidence */
4379 if (*s == '$')
4380 weight -= 3;
4381 else if (isDIGIT(*s)) {
4382 if (s[1] != ']') {
4383 if (isDIGIT(s[1]) && s[2] == ']')
4384 weight -= 10;
4385 }
4386 else
4387 weight -= 100;
4388 }
4389 Zero(seen,256,char);
4390 un_char = 255;
4391 for (; s < send; s++) {
4392 last_un_char = un_char;
4393 un_char = (unsigned char)*s;
4394 switch (*s) {
4395 case '@':
4396 case '&':
4397 case '$':
4398 weight -= seen[un_char] * 10;
4399 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4400 int len;
4401 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4402 len = (int)strlen(tmpbuf);
4403 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4404 UTF ? SVf_UTF8 : 0, SVt_PV))
4405 weight -= 100;
4406 else
4407 weight -= 10;
4408 }
4409 else if (*s == '$'
4410 && s[1]
4411 && memCHRs("[#!%*<>()-=",s[1]))
4412 {
4413 if (/*{*/ memCHRs("])} =",s[2]))
4414 weight -= 10;
4415 else
4416 weight -= 1;
4417 }
4418 break;
4419 case '\\':
4420 un_char = 254;
4421 if (s[1]) {
4422 if (memCHRs("wds]",s[1]))
4423 weight += 100;
4424 else if (seen[(U8)'\''] || seen[(U8)'"'])
4425 weight += 1;
4426 else if (memCHRs("rnftbxcav",s[1]))
4427 weight += 40;
4428 else if (isDIGIT(s[1])) {
4429 weight += 40;
4430 while (s[1] && isDIGIT(s[1]))
4431 s++;
4432 }
4433 }
4434 else
4435 weight += 100;
4436 break;
4437 case '-':
4438 if (s[1] == '\\')
4439 weight += 50;
4440 if (memCHRs("aA01! ",last_un_char))
4441 weight += 30;
4442 if (memCHRs("zZ79~",s[1]))
4443 weight += 30;
4444 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4445 weight -= 5; /* cope with negative subscript */
4446 break;
4447 default:
4448 if (!isWORDCHAR(last_un_char)
4449 && !(last_un_char == '$' || last_un_char == '@'
4450 || last_un_char == '&')
4451 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4452 char *d = s;
4453 while (isALPHA(*s))
4454 s++;
4455 if (keyword(d, s - d, 0))
4456 weight -= 150;
4457 }
4458 if (un_char == last_un_char + 1)
4459 weight += 5;
4460 weight -= seen[un_char];
4461 break;
4462 }
4463 seen[un_char]++;
4464 }
4465 if (weight >= 0) /* probably a character class */
4466 return FALSE;
4467 }
4469 return TRUE;
4470 }
4472 /*
4473 * S_intuit_method
4474 *
4475 * Does all the checking to disambiguate
4476 * foo bar
4477 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4478 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4479 *
4480 * First argument is the stuff after the first token, e.g. "bar".
4481 *
4482 * Not a method if foo is a filehandle.
4483 * Not a method if foo is a subroutine prototyped to take a filehandle.
4484 * Not a method if it's really "Foo $bar"
4485 * Method if it's "foo $bar"
4486 * Not a method if it's really "print foo $bar"
4487 * Method if it's really "foo package::" (interpreted as package->foo)
4488 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4489 * Not a method if bar is a filehandle or package, but is quoted with
4490 * =>
4491 */
4493 STATIC int
4494 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4495 {
4496 char *s = start + (*start == '$');
4497 char tmpbuf[sizeof PL_tokenbuf];
4498 STRLEN len;
4499 GV* indirgv;
4500 /* Mustn't actually add anything to a symbol table.
4501 But also don't want to "initialise" any placeholder
4502 constants that might already be there into full
4503 blown PVGVs with attached PVCV. */
4504 GV * const gv =
4505 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4507 PERL_ARGS_ASSERT_INTUIT_METHOD;
4509 if (!FEATURE_INDIRECT_IS_ENABLED)
4510 return 0;
4512 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4513 return 0;
4514 if (cv && SvPOK(cv)) {
4515 const char *proto = CvPROTO(cv);
4516 if (proto) {
4517 while (*proto && (isSPACE(*proto) || *proto == ';'))
4518 proto++;
4519 if (*proto == '*')
4520 return 0;
4521 }
4522 }
4524 if (*start == '$') {
4525 SSize_t start_off = start - SvPVX(PL_linestr);
4526 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4527 || isUPPER(*PL_tokenbuf))
4528 return 0;
4529 /* this could be $# */
4530 if (isSPACE(*s))
4531 s = skipspace(s);
4532 PL_bufptr = SvPVX(PL_linestr) + start_off;
4533 PL_expect = XREF;
4534 return *s == '(' ? FUNCMETH : METHOD;
4535 }
4537 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4538 /* start is the beginning of the possible filehandle/object,
4539 * and s is the end of it
4540 * tmpbuf is a copy of it (but with single quotes as double colons)
4541 */
4543 if (!keyword(tmpbuf, len, 0)) {
4544 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4545 len -= 2;
4546 tmpbuf[len] = '\0';
4547 goto bare_package;
4548 }
4549 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4550 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4551 SVt_PVCV);
4552 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4553 && (!isGV(indirgv) || GvCVu(indirgv)))
4554 return 0;
4555 /* filehandle or package name makes it a method */
4556 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4557 s = skipspace(s);
4558 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4559 return 0; /* no assumptions -- "=>" quotes bareword */
4560 bare_package:
4561 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4562 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4563 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4564 PL_expect = XTERM;
4565 force_next(BAREWORD);
4566 PL_bufptr = s;
4567 return *s == '(' ? FUNCMETH : METHOD;
4568 }
4569 }
4570 return 0;
4571 }
4573 /* Encoded script support. filter_add() effectively inserts a
4574 * 'pre-processing' function into the current source input stream.
4575 * Note that the filter function only applies to the current source file
4576 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4577 *
4578 * The datasv parameter (which may be NULL) can be used to pass
4579 * private data to this instance of the filter. The filter function
4580 * can recover the SV using the FILTER_DATA macro and use it to
4581 * store private buffers and state information.
4582 *
4583 * The supplied datasv parameter is upgraded to a PVIO type
4584 * and the IoDIRP/IoANY field is used to store the function pointer,
4585 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4586 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4587 * private use must be set using malloc'd pointers.
4588 */
4590 SV *
4591 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4592 {
4593 if (!funcp)
4594 return NULL;
4596 if (!PL_parser)
4597 return NULL;
4599 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4600 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4602 if (!PL_rsfp_filters)
4603 PL_rsfp_filters = newAV();
4604 if (!datasv)
4605 datasv = newSV(0);
4606 SvUPGRADE(datasv, SVt_PVIO);
4607 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4608 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4609 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4610 FPTR2DPTR(void *, IoANY(datasv)),
4611 SvPV_nolen(datasv)));
4612 av_unshift(PL_rsfp_filters, 1);
4613 av_store(PL_rsfp_filters, 0, datasv) ;
4614 if (
4615 !PL_parser->filtered
4616 && PL_parser->lex_flags & LEX_EVALBYTES
4617 && PL_bufptr < PL_bufend
4618 ) {
4619 const char *s = PL_bufptr;
4620 while (s < PL_bufend) {
4621 if (*s == '\n') {
4622 SV *linestr = PL_parser->linestr;
4623 char *buf = SvPVX(linestr);
4624 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
4625 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
4626 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
4627 STRLEN const linestart_pos = PL_parser->linestart - buf;
4628 STRLEN const last_uni_pos =
4629 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
4630 STRLEN const last_lop_pos =
4631 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
4632 av_push(PL_rsfp_filters, linestr);
4633 PL_parser->linestr =
4634 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
4635 buf = SvPVX(PL_parser->linestr);
4636 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
4637 PL_parser->bufptr = buf + bufptr_pos;
4638 PL_parser->oldbufptr = buf + oldbufptr_pos;
4639 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
4640 PL_parser->linestart = buf + linestart_pos;
4641 if (PL_parser->last_uni)
4642 PL_parser->last_uni = buf + last_uni_pos;
4643 if (PL_parser->last_lop)
4644 PL_parser->last_lop = buf + last_lop_pos;
4645 SvLEN_set(linestr, SvCUR(linestr));
4646 SvCUR_set(linestr, s - SvPVX(linestr));
4647 PL_parser->filtered = 1;
4648 break;
4649 }
4650 s++;
4651 }
4652 }
4653 return(datasv);
4654 }
4657 /* Delete most recently added instance of this filter function. */
4658 void
4659 Perl_filter_del(pTHX_ filter_t funcp)
4660 {
4661 SV *datasv;
4663 PERL_ARGS_ASSERT_FILTER_DEL;
4665 #ifdef DEBUGGING
4666 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
4667 FPTR2DPTR(void*, funcp)));
4668 #endif
4669 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
4670 return;
4671 /* if filter is on top of stack (usual case) just pop it off */
4672 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
4673 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
4674 sv_free(av_pop(PL_rsfp_filters));
4676 return;
4677 }
4678 /* we need to search for the correct entry and clear it */
4679 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
4680 }
4683 /* Invoke the idxth filter function for the current rsfp. */
4684 /* maxlen 0 = read one text line */
4685 I32
4686 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4687 {
4688 filter_t funcp;
4689 I32 ret;
4690 SV *datasv = NULL;
4691 /* This API is bad. It should have been using unsigned int for maxlen.
4692 Not sure if we want to change the API, but if not we should sanity
4693 check the value here. */
4694 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4696 PERL_ARGS_ASSERT_FILTER_READ;
4698 if (!PL_parser || !PL_rsfp_filters)
4699 return -1;
4700 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4701 /* Provide a default input filter to make life easy. */
4702 /* Note that we append to the line. This is handy. */
4703 DEBUG_P(PerlIO_printf(Perl_debug_log,
4704 "filter_read %d: from rsfp\n", idx));
4705 if (correct_length) {
4706 /* Want a block */
4707 int len ;
4708 const int old_len = SvCUR(buf_sv);
4710 /* ensure buf_sv is large enough */
4711 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4712 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4713 correct_length)) <= 0) {
4714 if (PerlIO_error(PL_rsfp))
4715 return -1; /* error */
4716 else
4717 return 0 ; /* end of file */
4718 }
4719 SvCUR_set(buf_sv, old_len + len) ;
4720 SvPVX(buf_sv)[old_len + len] = '\0';
4721 } else {
4722 /* Want a line */
4723 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4724 if (PerlIO_error(PL_rsfp))
4725 return -1; /* error */
4726 else
4727 return 0 ; /* end of file */
4728 }
4729 }
4730 return SvCUR(buf_sv);
4731 }
4732 /* Skip this filter slot if filter has been deleted */
4733 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4734 DEBUG_P(PerlIO_printf(Perl_debug_log,
4735 "filter_read %d: skipped (filter deleted)\n",
4736 idx));
4737 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4738 }
4739 if (SvTYPE(datasv) != SVt_PVIO) {
4740 if (correct_length) {
4741 /* Want a block */
4742 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4743 if (!remainder) return 0; /* eof */
4744 if (correct_length > remainder) correct_length = remainder;
4745 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4746 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4747 } else {
4748 /* Want a line */
4749 const char *s = SvEND(datasv);
4750 const char *send = SvPVX(datasv) + SvLEN(datasv);
4751 while (s < send) {
4752 if (*s == '\n') {
4753 s++;
4754 break;
4755 }
4756 s++;
4757 }
4758 if (s == send) return 0; /* eof */
4759 sv_catpvn(buf_sv, SvEND(datasv), s-SvEND(datasv));
4760 SvCUR_set(datasv, s-SvPVX(datasv));
4761 }
4762 return SvCUR(buf_sv);
4763 }
4764 /* Get function pointer hidden within datasv */
4765 funcp = DPTR2FPTR(filter_t, IoANY(datasv));
4766 DEBUG_P(PerlIO_printf(Perl_debug_log,
4767 "filter_read %d: via function %p (%s)\n",
4768 idx, (void*)datasv, SvPV_nolen_const(datasv)));
4769 /* Call function. The function is expected to */
4770 /* call "FILTER_READ(idx+1, buf_sv)" first. */
4771 /* Return: <0:error, =0:eof, >0:not eof */
4772 ENTER;
4773 save_scalar(PL_errgv);
4774 ret = (*funcp)(aTHX_ idx, buf_sv, correct_length);
4775 LEAVE;
4776 return ret;
4777 }
4779 STATIC char *
4780 S_filter_gets(pTHX_ SV *sv, STRLEN append)
4781 {
4782 PERL_ARGS_ASSERT_FILTER_GETS;
4784 #ifdef PERL_CR_FILTER
4785 if (!PL_rsfp_filters) {
4786 filter_add(S_cr_textfilter,NULL);
4787 }
4788 #endif
4789 if (PL_rsfp_filters) {
4790 if (!append)
4791 SvCUR_set(sv, 0); /* start with empty line */
4792 if (FILTER_READ(0, sv, 0) > 0)
4793 return ( SvPVX(sv) ) ;
4794 else
4795 return NULL ;
4796 }
4797 else
4798 return (sv_gets(sv, PL_rsfp, append));
4799 }
4801 STATIC HV *
4802 S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len)
4803 {
4804 GV *gv;
4806 PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
4808 if (memEQs(pkgname, len, "__PACKAGE__"))
4809 return PL_curstash;
4811 if (len > 2
4812 && (pkgname[len - 2] == ':' && pkgname[len - 1] == ':')
4813 && (gv = gv_fetchpvn_flags(pkgname,
4814 len,
4815 ( UTF ? SVf_UTF8 : 0 ), SVt_PVHV)))
4816 {
4817 return GvHV(gv); /* Foo:: */
4818 }
4820 /* use constant CLASS => 'MyClass' */
4821 gv = gv_fetchpvn_flags(pkgname, len, UTF ? SVf_UTF8 : 0, SVt_PVCV);
4822 if (gv && GvCV(gv)) {
4823 SV * const sv = cv_const_sv(GvCV(gv));
4824 if (sv)
4825 return gv_stashsv(sv, 0);
4826 }
4828 return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0);
4829 }
4832 STATIC char *
4833 S_tokenize_use(pTHX_ int is_use, char *s) {
4834 PERL_ARGS_ASSERT_TOKENIZE_USE;
4836 if (PL_expect != XSTATE)
4837 /* diag_listed_as: "use" not allowed in expression */
4838 yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
4839 is_use ? "use" : "no"));
4840 PL_expect = XTERM;
4841 s = skipspace(s);
4842 if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) {
4843 s = force_version(s, TRUE);
4844 if (*s == ';' || *s == '}'
4845 || (s = skipspace(s), (*s == ';' || *s == '}'))) {
4846 NEXTVAL_NEXTTOKE.opval = NULL;
4847 force_next(BAREWORD);
4848 }
4849 else if (*s == 'v') {
4850 s = force_word(s,BAREWORD,FALSE,TRUE);
4851 s = force_version(s, FALSE);
4852 }
4853 }
4854 else {
4855 s = force_word(s,BAREWORD,FALSE,TRUE);
4856 s = force_version(s, FALSE);
4857 }
4858 pl_yylval.ival = is_use;
4859 return s;
4860 }
4861 #ifdef DEBUGGING
4862 static const char* const exp_name[] =
4863 { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK",
4864 "ATTRTERM", "TERMBLOCK", "XBLOCKTERM", "POSTDEREF",
4865 "SIGVAR", "TERMORDORDOR"
4866 };
4867 #endif
4869 #define word_takes_any_delimiter(p,l) S_word_takes_any_delimiter(p,l)
4870 STATIC bool
4871 S_word_takes_any_delimiter(char *p, STRLEN len)
4872 {
4873 return (len == 1 && memCHRs("msyq", p[0]))
4874 || (len == 2
4875 && ((p[0] == 't' && p[1] == 'r')
4876 || (p[0] == 'q' && memCHRs("qwxr", p[1]))));
4877 }
4879 static void
4880 S_check_scalar_slice(pTHX_ char *s)
4881 {
4882 s++;
4883 while (SPACE_OR_TAB(*s)) s++;
4884 if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
4885 PL_bufend,
4886 UTF))
4887 {
4888 return;
4889 }
4890 while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
4891 || (*s && memCHRs(" \t$#+-'\"", *s)))
4892 {
4893 s += UTF ? UTF8SKIP(s) : 1;
4894 }
4895 if (*s == '}' || *s == ']')
4896 pl_yylval.ival = OPpSLICEWARNING;
4897 }
4899 #define lex_token_boundary() S_lex_token_boundary(aTHX)
4900 static void
4901 S_lex_token_boundary(pTHX)
4902 {
4903 PL_oldoldbufptr = PL_oldbufptr;
4904 PL_oldbufptr = PL_bufptr;
4905 }
4907 #define vcs_conflict_marker(s) S_vcs_conflict_marker(aTHX_ s)
4908 static char *
4909 S_vcs_conflict_marker(pTHX_ char *s)
4910 {
4911 lex_token_boundary();
4912 PL_bufptr = s;
4913 yyerror("Version control conflict marker");
4914 while (s < PL_bufend && *s != '\n')
4915 s++;
4916 return s;
4917 }
4919 static int
4920 yyl_sigvar(pTHX_ char *s)
4921 {
4922 /* we expect the sigil and optional var name part of a
4923 * signature element here. Since a '$' is not necessarily
4924 * followed by a var name, handle it specially here; the general
4925 * yylex code would otherwise try to interpret whatever follows
4926 * as a var; e.g. ($, ...) would be seen as the var '$,'
4927 */
4929 U8 sigil;
4931 s = skipspace(s);
4932 sigil = *s++;
4933 PL_bufptr = s; /* for error reporting */
4934 switch (sigil) {
4935 case '$':
4936 case '@':
4937 case '%':
4938 /* spot stuff that looks like an prototype */
4939 if (memCHRs("$:@%&*;\\[]", *s)) {
4940 yyerror("Illegal character following sigil in a subroutine signature");
4941 break;
4942 }
4943 /* '$#' is banned, while '$ # comment' isn't */
4944 if (*s == '#') {
4945 yyerror("'#' not allowed immediately following a sigil in a subroutine signature");
4946 break;
4947 }
4948 s = skipspace(s);
4949 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
4950 char *dest = PL_tokenbuf + 1;
4951 /* read var name, including sigil, into PL_tokenbuf */
4952 PL_tokenbuf[0] = sigil;
4953 parse_ident(&s, &dest, dest + sizeof(PL_tokenbuf) - 1,
4954 0, cBOOL(UTF), FALSE, FALSE);
4955 *dest = '\0';
4956 assert(PL_tokenbuf[1]); /* we have a variable name */
4957 }
4958 else {
4959 *PL_tokenbuf = 0;
4960 PL_in_my = 0;
4961 }
4963 s = skipspace(s);
4964 /* parse the = for the default ourselves to avoid '+=' etc being accepted here
4965 * as the ASSIGNOP, and exclude other tokens that start with =
4966 */
4967 if (*s == '=' && (!s[1] || memCHRs("=~>", s[1]) == 0)) {
4968 /* save now to report with the same context as we did when
4969 * all ASSIGNOPS were accepted */
4970 PL_oldbufptr = s;
4972 ++s;
4973 NEXTVAL_NEXTTOKE.ival = 0;
4974 force_next(ASSIGNOP);
4975 PL_expect = XTERM;
4976 }
4977 else if (*s == ',' || *s == ')') {
4978 PL_expect = XOPERATOR;
4979 }
4980 else {
4981 /* make sure the context shows the unexpected character and
4982 * hopefully a bit more */
4983 if (*s) ++s;
4984 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
4985 s++;
4986 PL_bufptr = s; /* for error reporting */
4987 yyerror("Illegal operator following parameter in a subroutine signature");
4988 PL_in_my = 0;
4989 }
4990 if (*PL_tokenbuf) {
4991 NEXTVAL_NEXTTOKE.ival = sigil;
4992 force_next('p'); /* force a signature pending identifier */
4993 }
4994 break;
4996 case ')':
4997 PL_expect = XBLOCK;
4998 break;
4999 case ',': /* handle ($a,,$b) */
5000 break;
5002 default:
5003 PL_in_my = 0;
5004 yyerror("A signature parameter must start with '$', '@' or '%'");
5005 /* very crude error recovery: skip to likely next signature
5006 * element */
5007 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
5008 s++;
5009 break;
5010 }
5012 switch (sigil) {
5013 case ',': TOKEN (PERLY_COMMA);
5014 case '$': TOKEN (PERLY_DOLLAR);
5015 case '@': TOKEN (PERLY_SNAIL);
5016 case '%': TOKEN (PERLY_PERCENT_SIGN);
5017 case ')': TOKEN (PERLY_PAREN_CLOSE);
5018 default: TOKEN (sigil);
5019 }
5020 }
5022 static int
5023 yyl_dollar(pTHX_ char *s)
5024 {
5025 CLINE;
5027 if (PL_expect == XPOSTDEREF) {
5028 if (s[1] == '#') {
5029 s++;
5030 POSTDEREF(DOLSHARP);
5031 }
5032 POSTDEREF(PERLY_DOLLAR);
5033 }
5035 if ( s[1] == '#'
5036 && ( isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
5037 || memCHRs("{$:+-@", s[2])))
5038 {
5039 PL_tokenbuf[0] = '@';
5040 s = scan_ident(s + 1, PL_tokenbuf + 1,
5041 sizeof PL_tokenbuf - 1, FALSE);
5042 if (PL_expect == XOPERATOR) {
5043 char *d = s;
5044 if (PL_bufptr > s) {
5045 d = PL_bufptr-1;
5046 PL_bufptr = PL_oldbufptr;
5047 }
5048 no_op("Array length", d);
5049 }
<a id="l5050" href="/perl5.git/blob/HEAD:/toke.c#l5050"