~nabijaczleweli/voreutils

a9102ff1d867f7b461fefedc93a524a95d31c738 — наб 21 days ago 9911525
Add expr
5 files changed, 923 insertions(+), 3 deletions(-)

M Makefile
M README.md
A cmd/expr.cpp
A man/expr.1
A tests/expr
M Makefile => Makefile +2 -2
@@ 133,9 133,9 @@ $(OBJDIR)man/% : man/%
# Renders a preprocessed manual page with $(MANDOC) -Tman: .TS/.TE tables and .ds/.as string definitions are first
# moved into "$<-TS<n>"/"$<-DS<n>" side files (leaving _VO_TS<n>/_VO_DS<n> placeholder lines), and spliced back in afterwards.
# NOTE(review): this listing comes from a diff whose +/- markers were lost; the grep/awk pair and the final awk line
# look like the old and the new tail of the same pipeline. Confirm against the real Makefile.
$(MANDIR)man1/% $(MANDIR)man3/% $(MANDIR)man8/% : $(OBJDIR)man/%
	@mkdir -p $(dir $@)
	$(AWK) 'BEGIN { ints=0; tsc=0 }  /^\.TS/,/^\.TE/ { if(!ints) { ints=1; ++tsc; print "_VO_TS" tsc } print > ("$<-TS" tsc); next }  { ints=0; print }' $< | \
		$(AWK) 'BEGIN { inds=0; dsc=0 }  /^\.[da]s/,!/^\.[da]s/ { if(!inds) { inds=1; ++dsc; print "_VO_DS" dsc } if($$0 ~ /^\.[da]s/) { print > ("$<-DS" dsc); next } }  { inds=0; print }' | \
		$(MANDOC) -I os="voreutils $(VOREUTILS_VERSION)" -Tman | \
		grep -vF 'Automatically generated from an mdoc input file.' | \
		$(AWK) '/^_VO_TS/ {tsc=substr($$0, 7); while(getline < ("$<-TS" tsc)) print; next}  {print}' > $@
		$(AWK) '/Automatically generated from an mdoc input file\./ { next }  /^_VO_[TD]S/ {ff = substr($$0, 5); while(getline < ("$<-" ff)) print; next}  { print }' > $@

$(HTMLMANDIR)man1/%.html $(HTMLMANDIR)man3/%.html $(HTMLMANDIR)man8/%.html : $(OBJDIR)man/%
	@mkdir -p $(dir $@)

M README.md => README.md +1 -1
@@ 48,7 48,7 @@ GNU coreutils provide the following 105 binaries, according to `dpkg -L coreutil
  * ☐ /usr/bin/du
  * ☑ /usr/bin/env – some parsing restrixions might be too strict (-0 in particular), but they match GNU env
  * ☑ /usr/bin/expand
  * ☐ /usr/bin/expr
  * ☑ /usr/bin/expr
  * ☑ /usr/bin/factor – only u64 (for now?); could also use -hx from NetBSD?  also should maybe include primes(6)
  * ☐ /usr/bin/fmt
  * ☐ /usr/bin/fold

A cmd/expr.cpp => cmd/expr.cpp +461 -0
@@ 0,0 1,461 @@
// SPDX-License-Identifier: 0BSD


#include <algorithm>
#include <array>
#include <cinttypes>
#include <clocale>
#include <cstring>
#include <memory>
#include <optional>
#include <regex.h>
#include <string>
#include <string_view>
#include <variant>
#include <vore-dupa>
#include <vore-numeric>
#include <vore-optarg>
#include <vore-print>
#include <vore-span>
#include <vore-stdio>
#include <vore-visit>


using namespace std::literals;

static const char * self{};


struct value;
struct match;
struct arithmetic;
struct comparison;
struct alternative;
struct conjunxion;
struct length;
struct index_;  // vs. index(3)
struct substr;

using expr_t = std::unique_ptr<std::variant<value, match, arithmetic, comparison, alternative, conjunxion, length, index_, substr>>;


/// Inline text buffer for computed results: room for a sign, 19 decimal digits and the terminating NUL.
using conststr = std::array<char, 21>;

/// A fully-evaluated operand: its text, plus the integer that text spells, if it spells one.
struct value {
	/// Either a view of text stored elsewhere, or a NUL-terminated buffer carried inside the value itself.
	std::variant<std::string_view, conststr> string;
	/// Engaged only when the text is an integer.  // also used in ARITHMETIC_BUILTIN
	std::optional<std::int64_t> integer;

	/// The text, whichever of the two representations holds it; the view is not necessarily NUL-terminated.
	constexpr std::string_view str() const noexcept {
		if(const auto * view = std::get_if<std::string_view>(&this->string))
			return *view;
		return std::string_view{std::get_if<conststr>(&this->string)->data()};
	}
};

/// str : rgx. rgx is a basic regular expression which must match at the start of str.
/// If rgx has a capture group: the first capture, or null; if not: the number of characters matched, or 0.
struct match {
	expr_t str;
	expr_t rgx;
};

/// Arithmetic operators; the enumerator values double as indices into arithmetic_s and arithmetic_fn.
enum class arithmetic_t : std::uint8_t {
	add,
	sub,
	mul,
	div,
	mod,
};

/// lhs arth rhs; both sides must be integers; exit 2 for division by 0; exit 3 for overflow
struct arithmetic {
	arithmetic_t arth;
	expr_t lhs;
	expr_t rhs;
};

/// Comparison operators; the enumerator values double as indices into comparison_s and comparison_fn.
enum class comparison_t : std::uint8_t {
	eq,
	ne,
	lt,
	le,
	ge,
	gt,
};

/// lhs cmp rhs, yielding 1 or 0: compared numerically if both sides are integers, by collation order otherwise
struct comparison {
	comparison_t cmp;
	expr_t lhs;
	expr_t rhs;
};

/// lhs | rhs: lhs if it is neither 0 nor null; else rhs if that is not null; else 0
struct alternative {
	expr_t lhs;
	expr_t rhs;
};

/// lhs & rhs: lhs if neither side is 0 or null; else 0
struct conjunxion {
	expr_t lhs;
	expr_t rhs;
};

/// length str: the number of characters (as opposed to bytes) in str, i.e. wcslen(str)
struct length {
	expr_t str;
};

/// index str chars: 1-based position in str of the first character that also occurs in chars, or 0 if there is none;
/// equivalent to [w]str.find_first_of([w]chars) + 1
struct index_ {
	expr_t str;
	expr_t chars;
};

/// substr str pos len: [w]str.substr(pos, len) with pos 1-based and clamped to wcslen(str);
/// null if either of pos or len is not an integer, or if pos is 0
struct substr {
	expr_t str;
	expr_t pos;
	expr_t len;
};


// Binary operator tokens, one table per precedence level, loosest-binding level first.
// Each table must be kept sorted: infix_precedence() searches them with std::binary_search()!
static constexpr std::string_view binary_ops_0   = "|"sv;
static constexpr std::string_view binary_ops_1   = "&"sv;
static constexpr std::string_view binary_ops_2[] = {"!="sv, "<"sv, "<="sv, "="sv, ">"sv, ">="sv};
static constexpr std::string_view binary_ops_3[] = {"+"sv, "-"sv};
static constexpr std::string_view binary_ops_4[] = {"%"sv, "*"sv, "/"sv};
static constexpr std::string_view binary_ops_5   = ":"sv;

// All levels in one place; an operator's position in here decides its precedence.
static constexpr vore::span<const std::string_view *> binary_ops[] = {
    {&binary_ops_0, &binary_ops_0 + 1},                  // |
    {&binary_ops_1, &binary_ops_1 + 1},                  // &
    {std::begin(binary_ops_2), std::end(binary_ops_2)},  // comparisons
    {std::begin(binary_ops_3), std::end(binary_ops_3)},  // additive
    {std::begin(binary_ops_4), std::end(binary_ops_4)},  // multiplicative
    {&binary_ops_5, &binary_ops_5 + 1},                  // :
};


/* Common shell of every arithmetic operator: a capture-less lambda taking both operands and returning the result.
 * Both operands must be integers; otherwise the first non-integer one is reported and the program exits with status 2.
 * The body for the all-integers case is passed as the variadic argument and must be a single { compound statement }.
 *
 * An operand's text is a std::string_view which needn't be NUL-terminated (it may be a slice of a longer argument),
 * so diagnostics always print it with an explicit length. */
#define ARITHMETIC_BASE(op, ...) \
	[](const value & lhs, const value & rhs) { \
		if(lhs.integer && rhs.integer) \
			__VA_ARGS__ \
		else { \
			const auto culprit = lhs.integer ? rhs.str() : lhs.str(); \
			std::fprintf(stderr, "%s: %.*s %c %.*s: %.*s not an integer\n", self, \
			             static_cast<int>(lhs.str().size()), lhs.str().data(), *#op, \
			             static_cast<int>(rhs.str().size()), rhs.str().data(), \
			             static_cast<int>(culprit.size()), culprit.data()); \
			std::exit(2); \
		} \
	}

/* +, -, *: defer to the matching __builtin_*_overflow(); overflow is reported as ERANGE with exit status 3. */
#define ARITHMETIC_BUILTIN(op, blt) \
	ARITHMETIC_BASE(op, { \
		std::int64_t ret; \
		if(__builtin_##blt##_overflow(*lhs.integer, *rhs.integer, &ret)) { \
			std::fprintf(stderr, "%s: %.*s %c %.*s: %s\n", self, \
			             static_cast<int>(lhs.str().size()), lhs.str().data(), *#op, \
			             static_cast<int>(rhs.str().size()), rhs.str().data(), std::strerror(ERANGE)); \
			std::exit(3); \
		} \
		return ret; \
	})

/* /, %: a zero divisor exits with status 2.
 * A divisor of -1 is special-cased, because INT64_MIN / -1 and INT64_MIN % -1 are undefined (and trap on x86):
 * x % -1 is always 0, and x / -1 is the negation of x, which overflows (ERANGE, exit status 3) only for INT64_MIN. */
#define ARITHMETIC_DIVISION(op) \
	ARITHMETIC_BASE(op, { \
		if(!*rhs.integer) { \
			std::fprintf(stderr, "%s: %.*s %c %.*s: %c 0\n", self, \
			             static_cast<int>(lhs.str().size()), lhs.str().data(), *#op, \
			             static_cast<int>(rhs.str().size()), rhs.str().data(), *#op); \
			std::exit(2); \
		} else if(*rhs.integer == -1) { \
			std::int64_t ret{}; \
			if(*#op == '/' && __builtin_sub_overflow(std::int64_t{}, *lhs.integer, &ret)) { \
				std::fprintf(stderr, "%s: %.*s %c %.*s: %s\n", self, \
				             static_cast<int>(lhs.str().size()), lhs.str().data(), *#op, \
				             static_cast<int>(rhs.str().size()), rhs.str().data(), std::strerror(ERANGE)); \
				std::exit(3); \
			} \
			return ret; \
		} else \
			return (*lhs.integer)op(*rhs.integer); \
	})

// Both tables are indexed with arithmetic_t, so they must follow its enumerator order.
static const constexpr char arithmetic_s[]             = {'+', '-', '*', '/', '%'};
using arithmetic_fn_t                                  = std::int64_t (*)(const value &, const value &);
static const constexpr arithmetic_fn_t arithmetic_fn[] = {ARITHMETIC_BUILTIN(+, add), ARITHMETIC_BUILTIN(-, sub), ARITHMETIC_BUILTIN(*, mul),  //
                                                          ARITHMETIC_DIVISION(/), ARITHMETIC_DIVISION(%)};


#define COMPARISON(op)                                              \
	[](const value & lhs, const value & rhs) {                        \
		if(lhs.integer && rhs.integer)                                  \
			return *lhs.integer op * rhs.integer;                         \
		else                                                            \
			return std::strcoll(lhs.str().data(), rhs.str().data()) op 0; \
	}
static const constexpr std::string_view comparison_s[]  = {"="sv, "!="sv, "<"sv, "<="sv, ">="sv, ">"sv};
using comparison_fn__t                                  = bool (*)(const value &, const value &);  // glibc has a comparison_fn_t at global scope! very cool!
static const constexpr comparison_fn__t comparison_fn[] = {COMPARISON(==), COMPARISON(!=), COMPARISON(<), COMPARISON(<=), COMPARISON(>=), COMPARISON(>)};


/// Wraps data in a value, recording its integer interpretation if vore::parse_sint<10>() accepts all of it.
///
/// The value only borrows data, so the underlying storage must outlive it.
/// NOTE(review): data.data()[data.size()] is read to detect the terminator, so data must be (a slice of)
/// a NUL-terminated buffer. This holds for every caller visible in this file; confirm for new ones.
static value parse(std::string_view data) {
	value ret{data, {}};

	// Only text starting with '-' or a decimal digit is a candidate.
	// The emptiness check must come first: indexing an empty view is undefined (and aborts in hardened libstdc++).
	// The digit test is spelled out to avoid passing a possibly-negative char to isdigit().
	if(!data.empty() && (data[0] == '-' || (data[0] >= '0' && data[0] <= '9'))) {
		// parse_sint() wants a C string: use the buffer in place if the NUL follows directly, a terminated stack copy otherwise.
		// The copy is taken in a statement of its own, since alloca()-style allocation inside a call's argument list is fragile.
		const char * terminated = data.data();
		if(terminated[data.size()] != '\0')
			terminated = strndupa(data.data(), data.size());

		if(!vore::parse_sint<10>(terminated, ret.integer.emplace()))
			ret.integer = {};
	}
	return ret;
}

/// Boxes an expression node: allocates the variant behind expr_t, holding the given node under the node's own type.
/// The argument is expanded twice, but once inside decltype(), so it is evaluated only once.
#define MAKE_EXPR(...) std::make_unique<expr_t::element_type>(std::in_place_type<decltype(__VA_ARGS__)>, __VA_ARGS__)

// Pratt parser loosely based on https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html

/// Binding power of the prefix operator `of`, or nothing if `of` isn't one.
/// All prefix operators share one power, above that of every binary operator.
static std::optional<std::uint8_t> prefix_precedence(const std::string_view & of) {
	static constexpr std::string_view prefix_ops[] = {"match"sv, "length"sv, "index"sv, "substr"sv, "+"sv};

	if(std::find(std::begin(prefix_ops), std::end(prefix_ops), of) == std::end(prefix_ops))
		return {};
	return (std::distance(std::begin(binary_ops), std::end(binary_ops)) + 1) * 2;  // max
}

/// {left, right} binding powers of the binary operator `of`, or nothing if `of` isn't one.
/// Level n of binary_ops gets {2n, 2n + 1}: the right side binds tighter, which makes the operators left-associative.
static std::optional<std::pair<std::uint8_t, std::uint8_t>> infix_precedence(const std::string_view & of) {
	std::uint8_t level{};
	for(auto && group : binary_ops) {
		if(std::binary_search(std::begin(group), std::end(group), of))
			return std::pair<std::uint8_t, std::uint8_t>{level * 2, level * 2 + 1};
		++level;
	}
	return {};
}

/// Builds the node for the binary operator token `from` out of its two operands.
/// `from` must be one of the tokens listed in binary_ops; anything else is undefined behaviour.
static expr_t make_binary(const std::string_view & from, expr_t && lhs, expr_t && rhs) {
#define MB(tp, ...) MAKE_EXPR(tp{__VA_ARGS__ std::move(lhs), std::move(rhs)})
	// The operators that are a node type all of their own
	if(from == "|"sv)
		return MB(alternative, );
	if(from == "&"sv)
		return MB(conjunxion, );
	if(from == ":"sv)
		return MB(match, );

	// The rest are told apart by their position in the symbol tables, which mirrors the enumerator order
	if(const auto cmp = std::find(std::begin(comparison_s), std::end(comparison_s), from); cmp != std::end(comparison_s))
		return MB(comparison, static_cast<comparison_t>(cmp - std::begin(comparison_s)), );

	if(from.size() == 1)
		if(const auto arth = std::find(std::begin(arithmetic_s), std::end(arithmetic_s), from[0]); arth != std::end(arithmetic_s))
			return MB(arithmetic, static_cast<arithmetic_t>(arth - std::begin(arithmetic_s)), );

	__builtin_unreachable();
}
/// Parses one expression from the token list at argv, advancing argv past everything consumed.
///
/// Only binary operators whose left binding power is at least min_prec are folded into the result;
/// parsing stops in front of the first token that is neither that nor absent, leaving it for the caller.
/// Running out of tokens or unbalanced parentheses is reported to the standard error stream, with exit status 2.
static expr_t ingest(const char * const *& argv, std::uint8_t min_prec) {
	if(!*argv)
		std::fprintf(stderr, "%s: not enough tokens\n", self), std::exit(2);
	std::string_view tok{*argv++};

	// The operand: a parenthesised expression, a prefix operator with its arguments, or a plain value
	expr_t lhs;
	if(tok == ")"sv)
		std::fprintf(stderr, "%s: ): unmatched parentheses\n", self), std::exit(2);
	else if(tok == "("sv) {
		lhs = ingest(argv, 0);
		if(!*argv || *argv++ != ")"sv)
			std::fprintf(stderr, "%s: (: unmatched parentheses\n", self), std::exit(2);
	} else if(const auto prec = prefix_precedence(tok); !prec)
		lhs = MAKE_EXPR(parse(tok));
	else if(tok == "match"sv)
		lhs = MAKE_EXPR(match{ingest(argv, *prec), ingest(argv, *prec)});
	else if(tok == "length"sv)
		lhs = MAKE_EXPR(length{ingest(argv, *prec)});
	else if(tok == "index"sv)
		lhs = MAKE_EXPR(index_{ingest(argv, *prec), ingest(argv, *prec)});
	else if(tok == "substr"sv)
		lhs = MAKE_EXPR(substr{ingest(argv, *prec), ingest(argv, *prec), ingest(argv, *prec)});
	else if(tok == "+"sv) {
		// + takes the next token verbatim, whatever it would otherwise mean
		if(!*argv)
			std::fprintf(stderr, "%s: +: not enough tokens\n", self), std::exit(2);

		lhs = MAKE_EXPR(parse(*argv++));
	} else
		__builtin_unreachable();

	// Then as many "op operand" tails as bind tightly enough
	while(*argv) {
		tok = *argv;  // guaranteed op

		const auto prec = infix_precedence(tok);
		if(!prec || prec->first < min_prec)
			break;
		++argv;

		lhs = make_binary(tok, std::move(lhs), ingest(argv, prec->second));
	}

	return lhs;
}

static void dump(const expr_t & expr);

/// Writes "(lhs op rhs)" to the standard error stream, both sides dumped recursively.
static void dump_bin(const expr_t & lhs, const char * op, const expr_t & rhs) {
	vore::fputc('(', stderr);
	dump(lhs);

	vore::fputc(' ', stderr);
	vore::fputs(op, stderr);
	vore::fputc(' ', stderr);

	dump(rhs);
	vore::fputc(')', stderr);
}
/// Writes "(nm arg...)" to the standard error stream, every argument dumped recursively and preceded by a space.
template <class... T>
static void dump_fn(const char * nm, const T &... args) {
	const auto spaced = [](const auto & arg) {
		vore::fputc(' ', stderr);
		dump(arg);
	};

	vore::fputc('(', stderr);
	vore::fputs(nm, stderr);
	(spaced(args), ...);
	vore::fputc(')', stderr);
}
/// Writes the tree rooted at expr to the standard error stream, fully parenthesised and with plain values in single quotes.
static void dump(const expr_t & expr) {
	std::visit(vore::overload{
	               // leaves
	               [](const value & leaf) {
		               const auto text = leaf.str();
		               vore::fputc('\'', stderr);
		               std::fwrite(text.data(), 1, text.size(), stderr);
		               vore::fputc('\'', stderr);
	               },
	               // binary operators
	               [](const match & node) { dump_bin(node.str, ":", node.rgx); },
	               [](const arithmetic & node) {
		               const char symbol[]{arithmetic_s[static_cast<std::uint8_t>(node.arth)], '\0'};
		               dump_bin(node.lhs, symbol, node.rhs);
	               },
	               [](const comparison & node) {
		               const auto & symbol = comparison_s[static_cast<std::uint8_t>(node.cmp)];
		               dump_bin(node.lhs, symbol.data(), node.rhs);
	               },
	               [](const alternative & node) { dump_bin(node.lhs, "|", node.rhs); },
	               [](const conjunxion & node) { dump_bin(node.lhs, "&", node.rhs); },
	               // named operators
	               [](const length & node) { dump_fn("length", node.str); },
	               [](const index_ & node) { dump_fn("index_", node.str, node.chars); },
	               [](const substr & node) { dump_fn("substr", node.str, node.pos, node.len); },
	           },
	           *expr);
}
/// Passes expr through unchanged; if EXPR_DUMP is in the environment, first dumps it, followed by a newline, to the standard error stream.
static const expr_t & dump_all(const expr_t & expr) {
	if(!std::getenv("EXPR_DUMP"))  // TODO: remove the entire dump thing later
		return expr;

	dump(expr);
	vore::fputc('\n', stderr);
	return expr;
}

/// Makes the value for the integer i, with its decimal text stored in the value's own buffer.
static value int_value(std::int64_t i) {
	value ret{conststr{}, i};
	auto & text = std::get<conststr>(ret.string);
	std::snprintf(text.data(), text.size(), "%" PRId64, i);
	return ret;
}

/// Counts the characters of cur, as decoded by mbrlen() in the current locale, and returns the count as a value.
/// Every byte that doesn't start a valid sequence counts as one character, as does an incomplete sequence at the end.
static value eval_length(std::string_view cur) {
	std::int64_t count{};
	std::mbstate_t state{};

	while(!cur.empty()) {
		auto step = std::mbrlen(cur.data(), cur.size(), &state);
		if(step == static_cast<std::size_t>(-2)) {  // incomplete: the whole tail is the last character
			++count;
			break;
		}

		if(step == static_cast<std::size_t>(-1)) {  // EILSEQ: reset, try to go past
			std::mbrtowc(nullptr, nullptr, 0, &state);
			step = 1;
		} else if(step == 0)
			step = 1;

		cur.remove_prefix(step);
		++count;
	}

	return int_value(count);
}
/// Re-parses part, a slice of owner's text, into a value that may outlive owner.
///
/// If owner carries its text in its own inline buffer, a view into it would dangle as soon as owner is destroyed,
/// so the slice is copied into the result's buffer instead. Otherwise owner merely borrows its text, and so can the result.
static value owned_slice(const value & owner, std::string_view part) {
	auto ret = parse(part);
	if(std::holds_alternative<conststr>(owner.string)) {
		conststr copy{};  // zero-filled, and part is shorter than the buffer it was cut from, so the copy stays NUL-terminated
		std::copy(part.begin(), part.end(), copy.begin());
		ret.string = copy;
	}
	return ret;
}

/// Evaluates the expression tree rooted at expr down to a single value.
///
/// Exits with status 2 on an invalid regular expression; the arithmetic operators additionally exit
/// on non-integer operands, division by zero and overflow, as implemented by arithmetic_fn.
/// Operands are always evaluated left to right.
static value eval(const expr_t & expr) {
	return std::visit(
	    vore::overload{
	        [](const value & val) { return val; },

	        // str : rgx
	        [](const match & mch) {
		        auto str = eval(mch.str);
		        auto rgx = eval(mch.rgx);

		        // regcomp() and regexec() take C strings, but an operand's text may be an unterminated slice of a longer string
		        const std::string subject{str.str()};
		        const std::string pattern{rgx.str()};

		        regex_t r __attribute__((__cleanup__(regfree)));
		        if(auto err = regcomp(&r, pattern.c_str(), 0)) {
			        auto es = regerror(err, &r, nullptr, 0);
			        auto eb = static_cast<char *>(std::malloc(es));
			        if(eb)
				        regerror(err, &r, eb, es);
			        std::fprintf(stderr, "%s: %.*s : %.*s: invalid expression%s%s\n", self,  //
			                     static_cast<int>(str.str().size()), str.str().data(),       //
			                     static_cast<int>(rgx.str().size()), rgx.str().data(),       //
			                     eb ? ": " : "", eb ? eb : "");
			        std::exit(2);  // doesn't unwind, so regfree() is not run on the failed r
		        }

		        // Only a match anchored at the very start counts
		        regmatch_t rm[2];
		        const bool matched = !regexec(&r, subject.c_str(), sizeof(rm) / sizeof(*rm), rm, 0) && rm[0].rm_so == 0;

		        // With a capture group in the pattern the result is always a string:
		        // the first capture, or null if there was no match or the group took no part in it
		        if(r.re_nsub) {
			        if(matched && rm[1].rm_so != -1)
				        return owned_slice(str, str.str().substr(rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so));
			        return value{""sv, {}};
		        }

		        // Without one it's the number of characters matched
		        if(matched)
			        return eval_length(str.str().substr(0, rm[0].rm_eo));
		        return value{"0"sv, 0};
	        },

	        // Function arguments are evaluated in an unspecified order, so the operands are named to pin it
	        [](const arithmetic & arth) {
		        const auto lhs = eval(arth.lhs);
		        const auto rhs = eval(arth.rhs);
		        return int_value(arithmetic_fn[static_cast<std::uint8_t>(arth.arth)](lhs, rhs));
	        },
	        [](const comparison & cmp) {
		        const auto lhs = eval(cmp.lhs);
		        const auto rhs = eval(cmp.rhs);
		        return int_value(comparison_fn[static_cast<std::uint8_t>(cmp.cmp)](lhs, rhs));
	        },

	        // lhs | rhs; rhs is only evaluated if lhs is 0 or null
	        [](const alternative & alt) {
		        auto lhs = eval(alt.lhs);
		        if(lhs.integer ? !*lhs.integer : lhs.str().empty()) {
			        auto rhs = eval(alt.rhs);
			        return rhs.str().empty() ? value{"0"sv, 0} : rhs;
		        } else
			        return lhs;
	        },

	        // lhs & rhs; both sides are always evaluated
	        [](const conjunxion & conj) {
		        auto lhs = eval(conj.lhs);
		        auto rhs = eval(conj.rhs);
		        if((lhs.integer ? !*lhs.integer : lhs.str().empty()) || (rhs.integer ? !*rhs.integer : rhs.str().empty()))
			        return value{"0"sv, 0};
		        else
			        return lhs;
	        },

	        [](const length & len) { return eval_length(eval(len.str).str()); },

	        // index str chars
	        [](const index_ & idx) {
		        // Decodes to wide characters, one per character as counted by length and substr
		        auto conv = [](auto && val) {
			        std::wstring ret;
			        std::mbstate_t ctx{};
			        wchar_t c;
			        for(auto cur = val.str(); !cur.empty();)
				        switch(auto r = std::mbrtowc(&c, cur.data(), cur.size(), &ctx)) {
					        case static_cast<std::size_t>(-2):  // incomplete
						        ret.push_back(L'\0');             // see below
						        goto break2;
					        case static_cast<std::size_t>(-1):  // EILSEQ: reset, try to go past
						        std::mbrtowc(nullptr, nullptr, 0, &ctx);
						        // need same width as length, &c.; we will never encounter a NUL in the input because it originates from argv
						        ret.push_back(L'\0');
						        cur = cur.substr(1);
						        break;
					        case 0:
						        r = 1;
						        [[fallthrough]];
					        default:
						        cur = cur.substr(r);
						        ret.push_back(c);
						        break;
				        }
			break2:;
			        return ret;
		        };

		        const auto haystack = conv(eval(idx.str));
		        const auto needles  = conv(eval(idx.chars));
		        return int_value(haystack.find_first_of(needles) + 1);  // unsigned overflow into 0
	        },

	        // substr str pos len; len and str are only evaluated if the operands before them are usable
	        [](const substr & ss) {
		        auto pos = eval(ss.pos).integer.value_or(0);
		        if(pos <= 0)
			        return value{""sv, {}};  // matches GNU expr behaviour

		        auto len = eval(ss.len).integer.value_or(0);
		        if(len <= 0)
			        return value{""sv, {}};  // non-integer matches GNU expr; 0 is short path

		        // Wanted characters are [first, last), counted from 0; computed unsigned because pos + len may overflow std::int64_t
		        const auto first = static_cast<std::size_t>(pos) - 1;
		        const auto last  = first + static_cast<std::size_t>(len);

		        auto str                = eval(ss.str);
		        auto s                  = str.str();
		        const char * const stop = s.data() + s.size();

		        // Consumes the character [cur, next): notes where the first wanted one starts,
		        // and keeps moving the end for as long as the characters are within the wanted range
		        std::size_t idx{};
		        const char *start{}, *end{};
		        auto commit = [&](auto & cur, auto next) {
			        if(idx == first)
				        start = cur;
			        ++idx;
			        if(idx >= first && idx <= last)
				        end = next;
			        cur = next;
		        };

		        // Stop at the end of the text (in bytes: idx counts characters, so it can't be compared to s.size())
		        std::mbstate_t ctx{};
		        for(auto cur = s.data(); cur != stop && idx <= last;)
			        switch(auto r = std::mbrlen(cur, stop - cur, &ctx)) {
				        case static_cast<std::size_t>(-2):  // incomplete
					        commit(cur, stop);
					        goto break2;
				        case static_cast<std::size_t>(-1):  // EILSEQ: reset, try to go past
					        std::mbrtowc(nullptr, nullptr, 0, &ctx);
					        [[fallthrough]];
				        case 0:
					        r = 1;
					        [[fallthrough]];
				        default:
					        commit(cur, cur + r);
					        break;
			        }
		break2:;

		        if(!start)  // pos is past the end
			        return value{""sv, {}};
		        return owned_slice(str, s.substr(start - s.data(), end - start));
	        },
	    },
	    *expr);
}


int main(int, const char * const * argv) {
	std::setlocale(LC_ALL, "");  // TODO: locale!

	self = argv[0];
	if(self)
		++argv;
	if(*argv && *argv == "--"sv)
		++argv;

	auto val = eval(dump_all(ingest(argv, 0)));
	if(*argv)
		return std::fprintf(stderr, "%s: %s: extraneous token\n", self, *argv), 2;

	vore::fwrite(val.str().data(), 1, val.str().size(), stdout);
	vore::fputc('\n', stdout);

	return vore::flush_stdout(self) ? 3 : !(val.integer ? *val.integer : !val.str().empty());
}

A man/expr.1 => man/expr.1 +397 -0
@@ 0,0 1,397 @@
.\" SPDX-License-Identifier: 0BSD
.\"
.Dd
.Dt EXPR 1
.Os
.
.Sh NAME
.Nm expr
.Nd evaluate expression
.Sh SYNOPSIS
.Nm
.Cm \&( Ar expr Cm \&) Oo Cm op Ar expr Oc Ns …
.Nm
.Cm + Ar argument , Cm match Ar string regex , Cm length Ar string , Cm index Ar string characters , Cm substr Ar string position length Oo Cm op Ar expr Oc Ns …
.Nm
.Ar string Cm \&: Ar regex Oo Cm op Ar expr Oc Ns …
.Nm
.Ar integer Bro Cm * , / , % Brc Ar integer Oo Cm op Ar expr Oc Ns …
.Nm
.Ar integer Bro Cm + , \- Brc Ar integer Oo Cm op Ar expr Oc Ns …
.Nm
.Ar expr Bro Cm < , <= , = , != , >= , > Brc Ar expr Oo Cm op Ar expr Oc Ns …
.Nm
.Ar expr Cm & Ar expr Oo Cm op Ar expr Oc Ns …
.Nm
.Ar expr Cm \&| Ar expr Oo Cm op Ar expr Oc Ns …
.
.Sh DESCRIPTION
Writes the evaluation of the expression given as the arguments, followed by a newline, to the standard output stream.
Many of the operators
.Pq Cm ()*<>&|\&
are special in shells \(em make sure to escape or stringify them.
.Pp
An expression qualifies as a number if it's a signed 64-bit integer
.Pq Bq Em \-9223372036854775808 , 9223372036854775807 ,
decimal, with only the optional
.Sq \-
allowed.
.Pp
All indices are
.Sy 1 Ns -based
according to characters in the current locale.
Each invalid multi-byte sequence is a separate character, but regular expressions stop matching at invalid sequences.
.
.Ss Operators
In chunked descending precedence; all binary operators left-associative.
.Pp
.Bl -tag -compact -width "match string regex"
.It Cm \&( Ar expr Cm \&)
.Ar expr
.Pp
.
.It Cm + Ar argument
Special case: immediately consumes
.Ar argument
(the next token) as a value, regardless of any special meaning.
.
.It Cm match Ar string regex
.Ar string Cm \&: Ar regex
.
.It Cm length Ar string
Character count in
.Ar string .
.
.It Cm index Ar string characters
The first position in
.Ar string
of any character from
.Ar characters ,
or
.Sy 0
if none.
.
.It Cm substr Ar string position length
.Bq Ar position , position No + Ar length
subsection of
.Ar string .
Empty if
.Ar position
or
.Ar length
are \[<=]
.Sy 0
or not integers.
.Pp
.
.It Ar string Cm \&: Ar regex
The length of the match of the basic regular expression
.Ar regex
matched to
.Ar string ,
anchored to the beginning
.Pq i.e. Ar regex No must match the start of Ar string No \(em this is similar to prepending a Qo Li ^ Qc to Ar regex ,
or
.Sy 0
if none.
.br
If
.Ar regex
has a capture group, evaluates to the first capture group
.Pq Li \e1 ,
or the null string if the match failed, instead.
.Pp
.
.\" Strictly, this should be expr-l not expr\-l (expr-dash-l not expr-minus-l), but it italicises better in -Tps, so
.It Ar "int  " Cm * Ar "int  "
Product of
.Ar int Ns s .
.
.It Ar int\-l Cm / Ar int\-r
.Ar int\-l
divided by
.Ar int\-r .
.
.It Ar int\-l Cm % Ar int\-r
Remainder from division of
.Ar int\-l
by
.Ar int\-r .
.Pp
.
.
.It Ar "int  " Cm + Ar "int  "
Sum of
.Ar int Ns s .
.
.It Ar int\-l Cm \- Ar int\-r
.Ar int\-r
subtracted from
.Ar int\-l .
.Pp
.
.It Ar expr Cm "< " Ar expr
.It Ar expr Cm "<=" Ar expr
.It Ar expr Cm " =" Ar expr
.It Ar expr Cm "!=" Ar expr
.It Ar expr Cm ">=" Ar expr
.It Ar expr Cm "> " Ar expr
If both expressions are integers, the result
.Pq Sy 0 No or Sy 1
of the corresponding comparison.
Otherwise, the result of the corresponding comparison between the strings according to the current locale's collating sequence (dictionary order).
.Pp
.
.It Ar expr\-l Cm & Ar expr\-r
If neither expression is the null string or
.Sy 0 :
.Ar expr\-l .
Otherwise
.Sy 0 .
.Pp
.
.It Ar expr\-l Cm \&| Ar expr\-r
If
.Ar expr\-l
is neither the null string nor
.Sy 0 :
.Ar expr\-l .
Otherwise, if
.Ar expr\-r
isn't the null string:
.Ar expr\-r .
Otherwise
.Sy 0 .
.Pp
.
.It Ar expr\-l Cm & Ar expr\-r
.Ar expr\-l
if neither expression is the null string or
.Sy 0 ;
otherwise
.Sy 0 .
.Pp
.
.It Ar expr\-l Cm \&| Ar expr\-r
.Ar expr\-l
if neither the null string nor
.Sy 0 ;
otherwise
.Ar expr\-r
if not the null string; otherwise
.Sy 0 .
.El
.
.Sh ENVIRONMENT
.Bl -tag -compact -width "EXPR_DUMP"
.It Ev EXPR_DUMP
If set, writes the final parse tree with parentheses around every expression, to the standard error stream.
This is a debugging feature and will be removed.
.\" TODO: remove
.El
.
.Sh EXIT STATUS
.Bl -tag -compact -width "I"
.It Sy 0
The expression evaluated to neither the null string nor
.Sy 0 .
.It Sy 1
The expression evaluated to the null string or
.Sy 0 .
.It Sy 2
Syntax error in expression, non-integer passed to an arithmetic operator, or division by zero.
.It Sy 3
Arithmetic overflow in
.Cm * , + , No or Cm \- .
.El
.
.Sh EXAMPLES
.Bd -literal -compact
.Li $ Nm Li 2 Cm + Li 2 Cm \e* Li 2
6
.Li $ Nm Cm \e( Li 2 Cm \e) + \e( Li 17 Cm \e* Li 2 Cm \e- Li 30 Cm \e) \e* \e( Li 5 Cm \e) + Li 2 Cm - \e( Li 8 Cm / Li 2 Cm \e) \e* Li 4
8

.Li $ Ev file Ns Li = Ns "'Makefile'\&;" Nm Li \&" Ns Ev $file Ns Li \&" Cm \&: Li "'.*/\e(.*\e)'" Cm \e| Li \&" Ns Ev $file Ns Li \&"
Makefile
.Li $ Ev file Ns Li = Ns "'/usr/src/voreutils/Makefile'\&;" Nm Li …
Makefile

.Li $ Ev file Ns Li = Ns "'Makefile'\&;" Nm Li \&" Ns Ev $file Ns Li \&" Cm \&: Li "'\e(/\e)[^/]*$'" Cm \e| Li \&" Ns Ev $file Ns Li \&" Cm \&: Li "'\e(.*\e)/'" Cm \e| Li '.'
\&.
.Li $ Ev file Ns Li = Ns "'/Makefile'\&;" Nm Li …
/
.Li $ Ev file Ns Li = Ns "'/usr/src/voreutils/Makefile'\&;" Nm Li …
/usr/src/voreutils

# However
.Li $ Ev file Ns Li = Ns "'length'\&;" Nm Li \&" Ns Ev $file Ns Li \&" Cm \&: Li "'.*/\e(.*\e)'" Cm \e| Li \&" Ns Ev $file Ns Li \&"
\&expr: .*/\e(.*\e): extraneous token
.Li $ Ev file Ns Li = Ns "'length'\&;" Nm Cm + Li \&" Ns Ev $file Ns Li \&" Cm \&: Li "'.*/\e(.*\e)'" Cm \e| + Li \&" Ns Ev $file Ns Li \&"
length
.Ed
.Pp
As part of a
.Xr sh 1
program:
.Bd -literal -compact
#!/bin/sh
.Nm Ar $# Cm \e<= Li 5 >/dev/null || {
.Li "   " Nm echo Li \&" Ns Ar $0 : Li Too many arguments" >&2
.Li "   " Nm exit Li 1
}
.Ed
.
.Sh SEE ALSO
Most arithmetic operations can be done using a
.Xr sh 1
arithmetic expression
.Pq Li $(( Ar expr Li )) ,
and basic string manipulation with parameter expansion operators
.Pq the Xr basename 1 Ns -like above can be written as Ev ${file##*/} , Cm length Li \&" Ns Ev $var Ns Li \&" No is Ev ${#var} , No &c.\& ;
these should be preferred for simple uses in new applications, as they're built into the shell and avoid unary operator SNAFUs.
.Pp
.Xr test 1 ,
.Xr strcoll 3 ,
.Xr mbrtowc 3 ,
.Xr locale 7 ,
.Xr regex 7
.
.Sh STANDARDS
Conforms to
.St -p1003.1-2008 ;
.Cm length , substr , index , No and Cm match
are explicitly unspecified, for compatibility with
.At v7 ,
and are scarcely supported in
.No non- Ns At
.Nm Ns s
.Pq Nx No supports Cm length , No citing GNU system compatibility; the list ends here .
Unary
.Cm +
is an extension, originating from the GNU system.
.Pp
Some
.Nm
implementations accept flags
.Pq like Fx Ns 's Fl e
\(em be wary of the first argument starting with a
.Fl ,
or start the argument list with a
.Fl - .
.
.Sh HISTORY
Appears in Edition 1.0 of The
.Tn PWB/UNIX
User's Manual, allowing
.Cm () , |&+-*/% , substr , length , No and Cm index ,
with the binary operators corresponding solely to their C equivalents on 16-bit
.Vt int Ns s .
.Pp
Edition 2.3 of The
.Tn CB-UNIX
Programmer's Manual sees 32-bit numbers,
.Cm \&| , & , Bro Cm = , > , >= , < , <= , != Brc , Cm +- , */% , No and Cm \&: ,
with
.Cm substr , length , No and Cm index
listed as
.Sx ARCHAIC FORMS .
.Cm \&|
is described simpler, as
.Ar expr\-l
if not nullary and
.Ar expr\-r
otherwise
.Pq with no Sy 0 Ns -folding ,
but the global behaviour is described as
.D1 Note that Sy 0 No is returned to indicate a zero value, rather than the null string.
The current-day behaviour matches and falls out of this.
The comparison operators for non-integers are byte-wise, owing to no system localisation.
.Cm \&:
rejects patterns with more than one capture group, but is otherwise as present-day.
Integer arguments to
.Cm substr
now default to
.Sy 0
instead of being required to be integers.
.Pp
.St -p1003.1-2008
notes that on some systems
.Cm \&:
is documented as literally injecting a
.Sy ^ ,
supposedly making another one in the pattern plain text, despite not doing so and selecting the match some other way \(em this is the case here.
Of interest is also that the
.Sx ARCHAIC FORMS
are such because they "have been made obsolete by the : operator" \(em the suggested replacements are:
.Bl -tag -compact -offset Ds -width "substr expra exprb exprc"
.It Cm substr Ar expra exprb exprc
Given
.Cm substr Ar abcd 2 2 :
.Ar abcd Cm \&: Ar '..\e(..\e)'
\(em this is mostly reasonable, but more accurate as
.Ar '..\e(..\e?\e)' ,
and more generic as
.Ar '.\e{2\e}\e(.\e{1,2\e}\e)' .
.It Cm length Ar expr
.Ar expr Cm \&: Ar '.*'
.It Cm index Ar expra exprb
Given
.Cm index Ar abcd d :
.Ar abcd Cm \&: Ar d .
Not even close!
This is approximately seven centimeters down from explaining how
.Cm \&:
is anchored and what that entails.
Recreating
.Cm index
is very likely impossible with
.Cm \&: ,
even for a simple single-letter case.
.El
.Pp
.Bl -tag -compact -offset Ds -width "substr abcd 2 2 "
.It Cm substr Ar abcd 2 2
.Ar abcd Cm \&: Ar '..\e(..\e)'
\(em this is mostly reasonable, but more accurate as
.Ar '..\e(..\e?\e)' ,
and more generic as
.Ar '.\e{2\e}\e(.\e{1,2\e}\e)' .
.It Cm length Ar expr
.Ar expr Cm \&: Ar '.*'
.It Cm index Ar abcd d
.Ar abcd Cm \&: Ar d .
Not even close!
This is approximately seven centimeters down from explaining how
.Cm \&:
is anchored and what that entails.
Recreating
.Cm index
is very likely impossible with
.Cm \&: ,
even for a simple single-letter case.
.El
.Cm match
is also available, but wholly undocumented.  \" assuming it's the same as SysIII
.Pp
.At III
inherits the
.Tn CB-UNIX
manual page but strips it of the unary operators.
.Pp
.At V
removes
.Cm substr , length , No and Cm index .
.Pp
.At v7 ,
on the other hand, sees an
.Nm
compatible with
.Tn CB-UNIX Ns 's ,
but with an unrelated manual page, not mentioning the unary operators at all.
.Pp
.Bx 4.4
errors on
.Cm /%
dividing by zero instead of performing the division
.Pq which resolves to zero on the Tn PDP-11 No but a Dv SIGFPE No on the Tn VAX .

A tests/expr => tests/expr +62 -0
@@ 0,0 1,62 @@
#!/bin/sh
# SPDX-License-Identifier: 0BSD

# Tests for expr: the examples from expr.1, integer recognition, locale-dependent collation, and write-error handling.
# Failures are reported on file descriptor 3, which the caller is expected to have opened.

# $tmpdir ends in a slash, so paths below are formed as "${tmpdir}name"
tmpdir="$(mktemp -dt "expr.XXXXXXXXXX")/"
expr="${CMDDIR}expr"
seq="${CMDDIR}seq"
# Compiled in the background; only needed for the collation tests after the "wait" below
localedef -i cs_CZ -c -f UTF-8 "${tmpdir}cs_CZ.UTF-8" &
localedef -i en_GB -c -f UTF-8 "${tmpdir}en_GB.UTF-8" &


# Each example is also run with its own result subtracted: a 0 result must be printed, but with a failing exit status
[ "$(  "$expr" 2 + 2 \* 2)" = '6' ] || echo "expr: expr.1-1.1 wrong" >&3
out="$("$expr" 2 + 2 \* 2   -  6)"  && echo "expr: expr.1-1.1+ okay?" >&3
[ "$out" = '0'  ]                   || echo "expr: expr.1-1.1+ wrong" >&3

[ "$(  "$expr" \( 2 \) + \( 17 \* 2 \- 30 \) \* \( 5 \) + 2 - \( 8 / 2 \) \* 4)" = '8' ] || echo "expr: expr.1-1.2 wrong" >&3
out="$("$expr" \( 2 \) + \( 17 \* 2 \- 30 \) \* \( 5 \) + 2 - \( 8 / 2 \) \* 4   -  8)"  && echo "expr: expr.1-1.2+ okay?" >&3
[ "$out" = '0'  ]                                                                        || echo "expr: expr.1-1.2+ wrong" >&3


file='Makefile';                    [ "$("$expr" "$file" : '.*/\(.*\)' \| "$file")" = 'Makefile' ] || echo "expr: expr.1-2.1 wrong" >&3
file='/Makefile';                   [ "$("$expr" "$file" : '.*/\(.*\)' \| "$file")" = 'Makefile' ] || echo "expr: expr.1-2+  wrong" >&3
file='/usr/src/voreutils/Makefile'; [ "$("$expr" "$file" : '.*/\(.*\)' \| "$file")" = 'Makefile' ] || echo "expr: expr.1-2.2 wrong" >&3

file='Makefile';                    [ "$("$expr" "$file" : '\(/\)[^/]*$' \| "$file" : '\(.*\)/' \| '.')" = '.'                  ] || echo "expr: expr.1-3.1 wrong" >&3
file='/Makefile';                   [ "$("$expr" "$file" : '\(/\)[^/]*$' \| "$file" : '\(.*\)/' \| '.')" = '/'                  ] || echo "expr: expr.1-3.2 wrong" >&3
file='/usr/src/voreutils/Makefile'; [ "$("$expr" "$file" : '\(/\)[^/]*$' \| "$file" : '\(.*\)/' \| '.')" = '/usr/src/voreutils' ] || echo "expr: expr.1-3.3 wrong" >&3


# A file named like an operator: a syntax error unless escaped with +
file='length'; out="$("$expr" "$file" : '.*/\(.*\)' \| "$file" 2>"${tmpdir}err")"; err=$?
[ "$err" -eq 2 ]      || echo "expr: expr.1-4.1 okay?" >&3
[ -s "${tmpdir}err" ] || echo "expr: expr.1-4.1 empty stderr" >&3
# stdout was captured into $out, not into a file
[ -n "$out" ]         && echo "expr: expr.1-4.1 non-empty stdout" >&3

file='length'; [ "$("$expr" + "$file" : '.*/\(.*\)' \| + "$file")" = 'length' ] || echo "expr: expr.1-4.2 wrong" >&3


for i in $("$seq" -10 5); do
	"$expr" $i '<=' 5 >/dev/null || echo "expr: expr.1-5a failed?" >&3
done
for i in $("$seq" 6 20); do
	"$expr" $i '<=' 5 >/dev/null && echo "expr: expr.1-5b okay?" >&3
done


# "+10" and " 10" must not be taken for integers
out="$("$expr" -10 \<  +10  2>&3)" && echo "expr: -10 < +10 okay?" >&3
out="$("$expr" -10 \< ' 10' 2>&3)" && echo "expr: -10 <  10 okay?" >&3
out="$("$expr" -10 \<   10  2>&3)" || echo "expr: -10 < 10  failed?" >&3


# Wait for the locales
wait
out="$(LOCPATH="${tmpdir}" LC_ALL=en_GB.UTF-8 "$expr" H \< ch 2>&3)" && echo "expr: H < ch en_GB okay?" >&3
out="$(LOCPATH="${tmpdir}" LC_ALL=cs_CZ.UTF-8 "$expr" H \< ch 2>&3)" || echo "expr: H < ch cs_CZ failed?" >&3
rm -rf "$tmpdir" 2>&3

! [ -w '/dev/full' ] && {
  echo "expr: skipping error testing, /dev/full unavailable" >&2
  exit
}

# Failing to write the result must be diagnosed, with exit status 3
errmsg="$("$expr" 1 2>&1 > /dev/full)"; err=$?
[ "$err" = 3 ]   || echo "expr: /dev/full okay?" >&3
[ -n "$errmsg" ] || echo "expr: no message after /dev/full?" >&3