str_ops.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. // String primitive operations
  2. //
  3. // These are registered in mypyc.primitives.str_ops.
  4. #include <Python.h>
  5. #include "CPy.h"
  6. PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
  7. if (PyUnicode_READY(str) != -1) {
  8. if (CPyTagged_CheckShort(index)) {
  9. Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
  10. Py_ssize_t size = PyUnicode_GET_LENGTH(str);
  11. if (n < 0)
  12. n += size;
  13. if (n < 0 || n >= size) {
  14. PyErr_SetString(PyExc_IndexError, "string index out of range");
  15. return NULL;
  16. }
  17. enum PyUnicode_Kind kind = (enum PyUnicode_Kind)PyUnicode_KIND(str);
  18. void *data = PyUnicode_DATA(str);
  19. Py_UCS4 ch = PyUnicode_READ(kind, data, n);
  20. PyObject *unicode = PyUnicode_New(1, ch);
  21. if (unicode == NULL)
  22. return NULL;
  23. if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
  24. PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
  25. } else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
  26. PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
  27. } else {
  28. assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
  29. PyUnicode_4BYTE_DATA(unicode)[0] = ch;
  30. }
  31. return unicode;
  32. } else {
  33. PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
  34. return NULL;
  35. }
  36. } else {
  37. PyObject *index_obj = CPyTagged_AsObject(index);
  38. return PyObject_GetItem(str, index_obj);
  39. }
  40. }
  41. // A simplification of _PyUnicode_JoinArray() from CPython 3.9.6
  42. PyObject *CPyStr_Build(Py_ssize_t len, ...) {
  43. Py_ssize_t i;
  44. va_list args;
  45. // Calculate the total amount of space and check
  46. // whether all components have the same kind.
  47. Py_ssize_t sz = 0;
  48. Py_UCS4 maxchar = 0;
  49. int use_memcpy = 1; // Use memcpy by default
  50. PyObject *last_obj = NULL;
  51. va_start(args, len);
  52. for (i = 0; i < len; i++) {
  53. PyObject *item = va_arg(args, PyObject *);
  54. if (!PyUnicode_Check(item)) {
  55. PyErr_Format(PyExc_TypeError,
  56. "sequence item %zd: expected str instance,"
  57. " %.80s found",
  58. i, Py_TYPE(item)->tp_name);
  59. return NULL;
  60. }
  61. if (PyUnicode_READY(item) == -1)
  62. return NULL;
  63. size_t add_sz = PyUnicode_GET_LENGTH(item);
  64. Py_UCS4 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
  65. maxchar = Py_MAX(maxchar, item_maxchar);
  66. // Using size_t to avoid overflow during arithmetic calculation
  67. if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
  68. PyErr_SetString(PyExc_OverflowError,
  69. "join() result is too long for a Python string");
  70. return NULL;
  71. }
  72. sz += add_sz;
  73. // If these strings have different kind, we would call
  74. // _PyUnicode_FastCopyCharacters() in the following part.
  75. if (use_memcpy && last_obj != NULL) {
  76. if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
  77. use_memcpy = 0;
  78. }
  79. last_obj = item;
  80. }
  81. va_end(args);
  82. // Construct the string
  83. PyObject *res = PyUnicode_New(sz, maxchar);
  84. if (res == NULL)
  85. return NULL;
  86. if (use_memcpy) {
  87. unsigned char *res_data = PyUnicode_1BYTE_DATA(res);
  88. unsigned int kind = PyUnicode_KIND(res);
  89. va_start(args, len);
  90. for (i = 0; i < len; ++i) {
  91. PyObject *item = va_arg(args, PyObject *);
  92. Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
  93. if (itemlen != 0) {
  94. memcpy(res_data, PyUnicode_DATA(item), kind * itemlen);
  95. res_data += kind * itemlen;
  96. }
  97. }
  98. va_end(args);
  99. assert(res_data == PyUnicode_1BYTE_DATA(res) + kind * PyUnicode_GET_LENGTH(res));
  100. } else {
  101. Py_ssize_t res_offset = 0;
  102. va_start(args, len);
  103. for (i = 0; i < len; ++i) {
  104. PyObject *item = va_arg(args, PyObject *);
  105. Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
  106. if (itemlen != 0) {
  107. _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
  108. res_offset += itemlen;
  109. }
  110. }
  111. va_end(args);
  112. assert(res_offset == PyUnicode_GET_LENGTH(res));
  113. }
  114. assert(_PyUnicode_CheckConsistency(res, 1));
  115. return res;
  116. }
  117. PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split) {
  118. Py_ssize_t temp_max_split = CPyTagged_AsSsize_t(max_split);
  119. if (temp_max_split == -1 && PyErr_Occurred()) {
  120. PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
  121. return NULL;
  122. }
  123. return PyUnicode_Split(str, sep, temp_max_split);
  124. }
  125. PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr,
  126. PyObject *new_substr, CPyTagged max_replace) {
  127. Py_ssize_t temp_max_replace = CPyTagged_AsSsize_t(max_replace);
  128. if (temp_max_replace == -1 && PyErr_Occurred()) {
  129. PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
  130. return NULL;
  131. }
  132. return PyUnicode_Replace(str, old_substr, new_substr, temp_max_replace);
  133. }
  134. bool CPyStr_Startswith(PyObject *self, PyObject *subobj) {
  135. Py_ssize_t start = 0;
  136. Py_ssize_t end = PyUnicode_GET_LENGTH(self);
  137. return PyUnicode_Tailmatch(self, subobj, start, end, -1);
  138. }
  139. bool CPyStr_Endswith(PyObject *self, PyObject *subobj) {
  140. Py_ssize_t start = 0;
  141. Py_ssize_t end = PyUnicode_GET_LENGTH(self);
  142. return PyUnicode_Tailmatch(self, subobj, start, end, 1);
  143. }
  144. /* This does a dodgy attempt to append in place */
  145. PyObject *CPyStr_Append(PyObject *o1, PyObject *o2) {
  146. PyUnicode_Append(&o1, o2);
  147. return o1;
  148. }
  149. PyObject *CPyStr_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
  150. if (likely(PyUnicode_CheckExact(obj)
  151. && CPyTagged_CheckShort(start) && CPyTagged_CheckShort(end))) {
  152. Py_ssize_t startn = CPyTagged_ShortAsSsize_t(start);
  153. Py_ssize_t endn = CPyTagged_ShortAsSsize_t(end);
  154. if (startn < 0) {
  155. startn += PyUnicode_GET_LENGTH(obj);
  156. if (startn < 0) {
  157. startn = 0;
  158. }
  159. }
  160. if (endn < 0) {
  161. endn += PyUnicode_GET_LENGTH(obj);
  162. if (endn < 0) {
  163. endn = 0;
  164. }
  165. }
  166. return PyUnicode_Substring(obj, startn, endn);
  167. }
  168. return CPyObject_GetSlice(obj, start, end);
  169. }
  170. /* Check if the given string is true (i.e. its length isn't zero) */
  171. bool CPyStr_IsTrue(PyObject *obj) {
  172. Py_ssize_t length = PyUnicode_GET_LENGTH(obj);
  173. return length != 0;
  174. }
  175. Py_ssize_t CPyStr_Size_size_t(PyObject *str) {
  176. if (PyUnicode_READY(str) != -1) {
  177. return PyUnicode_GET_LENGTH(str);
  178. }
  179. return -1;
  180. }
  181. PyObject *CPy_Decode(PyObject *obj, PyObject *encoding, PyObject *errors) {
  182. const char *enc = NULL;
  183. const char *err = NULL;
  184. if (encoding) {
  185. enc = PyUnicode_AsUTF8AndSize(encoding, NULL);
  186. if (!enc) return NULL;
  187. }
  188. if (errors) {
  189. err = PyUnicode_AsUTF8AndSize(errors, NULL);
  190. if (!err) return NULL;
  191. }
  192. if (PyBytes_Check(obj)) {
  193. return PyUnicode_Decode(((PyBytesObject *)obj)->ob_sval,
  194. ((PyVarObject *)obj)->ob_size,
  195. enc, err);
  196. } else {
  197. return PyUnicode_FromEncodedObject(obj, enc, err);
  198. }
  199. }
  200. PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors) {
  201. const char *enc = NULL;
  202. const char *err = NULL;
  203. if (encoding) {
  204. enc = PyUnicode_AsUTF8AndSize(encoding, NULL);
  205. if (!enc) return NULL;
  206. }
  207. if (errors) {
  208. err = PyUnicode_AsUTF8AndSize(errors, NULL);
  209. if (!err) return NULL;
  210. }
  211. if (PyUnicode_Check(obj)) {
  212. return PyUnicode_AsEncodedString(obj, enc, err);
  213. } else {
  214. PyErr_BadArgument();
  215. return NULL;
  216. }
  217. }